summary refs log tree commit diff
diff options
context:
space:
mode:
author: David Hook <dgh@cryptoworkshop.com>, 2022-07-24 19:47:12 +1000
committer: David Hook <dgh@cryptoworkshop.com>, 2022-07-24 19:47:12 +1000
commit: d9deb674505071c39e342aa121ed3cc65cac9f2a (patch)
tree: 2e9dfc5d8ec9c01d88cf5085137fef4a982e66ae
parent: moved ntrup to ntruprime, reduced accessors on keys, marked key fields internal (diff)
parent: ChaCha20Poly1305 perf. opts. (diff)
download: BouncyCastle.NET-ed25519-d9deb674505071c39e342aa121ed3cc65cac9f2a.tar.xz
Merge remote-tracking branch 'refs/remotes/origin/master'
-rw-r--r-- crypto/Contributors.html 3
-rw-r--r-- crypto/src/crypto/engines/ChaCha7539Engine.cs 405
-rw-r--r-- crypto/src/crypto/engines/ChaChaEngine.cs 245
-rw-r--r-- crypto/src/crypto/engines/Salsa20Engine.cs 12
-rw-r--r-- crypto/src/crypto/macs/Poly1305.cs 176
-rw-r--r-- crypto/src/crypto/modes/ChaCha20Poly1305.cs 130
-rw-r--r-- crypto/src/tls/crypto/impl/bc/BcChaCha20Poly1305.cs 4
7 files changed, 763 insertions, 212 deletions
diff --git a/crypto/Contributors.html b/crypto/Contributors.html
index 388b023fd..1a7479003 100644
--- a/crypto/Contributors.html
+++ b/crypto/Contributors.html
@@ -36,7 +36,8 @@
 		<p>Contributors - Organisations.<p>
 		<ul>
 <li>
-Monash University, Cyber Security Lab, under the supervision of A. Prof. Ron Steinfeld, Dr. Amin Sakzad, and Dr. Raymond K. Zhao for contributions to the NIST post-quantum algorithm set. Initial NTRU Prime implementation: Yuki Kume.
+Monash University, Cyber Security Lab, under the supervision of A. Prof. Ron Steinfeld, Dr. Amin Sakzad, and Dr. Raymond K. Zhao
+for contributions to the NIST post-quantum algorithm set. Initial NTRU Prime implementation: Yuki Kume.
 </li>
 		</ul>
 		<p>Contributors - People</p>
diff --git a/crypto/src/crypto/engines/ChaCha7539Engine.cs b/crypto/src/crypto/engines/ChaCha7539Engine.cs
index 206416a98..81e97478b 100644
--- a/crypto/src/crypto/engines/ChaCha7539Engine.cs
+++ b/crypto/src/crypto/engines/ChaCha7539Engine.cs
@@ -1,4 +1,12 @@
 using System;
+using System.Diagnostics;
+#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER
+using System.Runtime.CompilerServices;
+#endif
+#if NETCOREAPP3_0_OR_GREATER
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 using Org.BouncyCastle.Crypto.Utilities;
 
@@ -58,9 +66,398 @@ namespace Org.BouncyCastle.Crypto.Engines
 
         protected override void GenerateKeyStream(byte[] output)
         {
-            ChaChaEngine.ChachaCore(rounds, engineState, x);
-            Pack.UInt32_To_LE(x, output, 0);
+            ChaChaEngine.ChachaCore(rounds, engineState, output);
         }
-    }
-}
 
+		internal void DoFinal(byte[] inBuf, int inOff, int inLen, byte[] outBuf, int outOff) // XORs inLen bytes of inBuf with keystream into outBuf, then zeroes state word 12
+		{
+			if (!initialised)
+				throw new InvalidOperationException(AlgorithmName + " not initialised");
+			if (index != 0) // only accepted when index == 0 (reported below as "block-aligned")
+				throw new InvalidOperationException(AlgorithmName + " not in block-aligned state");
+
+			Check.DataLength(inBuf, inOff, inLen, "input buffer too short");
+			Check.OutputLength(outBuf, outOff, inLen, "output buffer too short");
+
+			while (inLen >= 128) // bulk path: 128 bytes (two 64-byte blocks) per call
+            {
+				ProcessBlocks2(inBuf, inOff, outBuf, outOff);
+				inOff += 128;
+				inLen -= 128;
+				outOff += 128;
+			}
+
+			if (inLen >= 64) // fewer than 128 bytes remain here, so at most one whole 64-byte block
+			{
+				ImplProcessBlock(inBuf, inOff, outBuf, outOff); // NOTE(review): no LimitExceeded check on this path, unlike ProcessBlock/ProcessBlocks2 - confirm intended
+				inOff += 64;
+				inLen -= 64;
+				outOff += 64;
+			}
+
+			if (inLen > 0) // trailing partial block of 1..63 bytes
+            {
+                GenerateKeyStream(keyStream); // NOTE(review): also no LimitExceeded check for the tail - confirm intended
+                AdvanceCounter();
+
+				for (int i = 0; i < inLen; ++i) // only the first inLen keystream bytes are used
+                {
+                    outBuf[outOff + i] = (byte)(inBuf[i + inOff] ^ keyStream[i]);
+                }
+			}
+
+			engineState[12] = 0; // word 12 is the word the SIMD block routines increment once per block
+
+			// TODO Prevent re-use if encrypting
+		}
+
+		internal void ProcessBlock(byte[] inBytes, int inOff, byte[] outBytes, int outOff) // XORs one 64-byte block with keystream after the initialisation and byte-limit checks
+        {
+            if (!initialised)
+                throw new InvalidOperationException(AlgorithmName + " not initialised");
+            if (LimitExceeded(64U)) // accounts for the 64 bytes about to be processed
+                throw new MaxBytesExceededException("2^38 byte limit per IV would be exceeded; Change IV");
+
+            Debug.Assert(index == 0); // checked in debug builds only
+
+			ImplProcessBlock(inBytes, inOff, outBytes, outOff); // no buffer-length validation is done here; the callee indexes 64 bytes from each offset
+        }
+
+        internal void ProcessBlocks2(byte[] inBytes, int inOff, byte[] outBytes, int outOff) // XORs 128 bytes (two 64-byte blocks) with keystream, using SIMD when available
+        {
+            if (!initialised)
+                throw new InvalidOperationException(AlgorithmName + " not initialised");
+            if (LimitExceeded(128U)) // accounts for the 128 bytes about to be processed
+                throw new MaxBytesExceededException("2^38 byte limit per IV would be exceeded; Change IV");
+
+            Debug.Assert(index == 0); // checked in debug builds only
+
+#if NETCOREAPP3_0_OR_GREATER
+            if (Avx2.IsSupported) // first choice: both blocks computed together in 256-bit vectors
+            {
+                ImplProcessBlocks2_X86_Avx2(rounds, engineState, inBytes.AsSpan(inOff), outBytes.AsSpan(outOff)); // spans run from the offset to the end of each array; the callee only Debug.Asserts >= 128 bytes
+                return;
+            }
+
+            if (Sse2.IsSupported) // second choice: 128-bit vectors, one block after the other
+            {
+                ImplProcessBlocks2_X86_Sse2(rounds, engineState, inBytes.AsSpan(inOff), outBytes.AsSpan(outOff));
+                return;
+            }
+#endif
+
+            { // portable fallback: two single-block calls
+				ImplProcessBlock(inBytes, inOff, outBytes, outOff);
+				ImplProcessBlock(inBytes, inOff + 64, outBytes, outOff + 64);
+			}
+		}
+
+#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER
+		[MethodImpl(MethodImplOptions.AggressiveInlining)]
+#endif
+		internal void ImplProcessBlock(byte[] inBuf, int inOff, byte[] outBuf, int outOff) // XORs exactly 64 bytes with a fresh keystream block; performs no initialisation, limit or length checks itself
+        {
+			ChaChaEngine.ChachaCore(rounds, engineState, keyStream); // fills the shared keyStream buffer from the current state
+			AdvanceCounter(); // defined outside this hunk; called once per generated block
+
+			for (int i = 0; i < 64; ++i)
+			{
+				outBuf[outOff + i] = (byte)(keyStream[i] ^ inBuf[inOff + i]);
+			}
+		}
+
+#if NETCOREAPP3_0_OR_GREATER
+		[MethodImpl(MethodImplOptions.AggressiveInlining)]
+		internal static void ImplProcessBlocks2_X86_Avx2(int rounds, uint[] state, Span<byte> input, Span<byte> output) // XORs 128 input bytes with two keystream blocks computed side by side; increments state[12] twice
+		{
+			if (!Avx2.IsSupported)
+				throw new PlatformNotSupportedException();
+
+			Debug.Assert(rounds % 2 == 0); // the loop below consumes rounds two at a time
+			Debug.Assert(state.Length >= 16);
+			Debug.Assert(input.Length >= 128); // debug-only; release builds rely on the caller for lengths
+			Debug.Assert(output.Length >= 128);
+
+			var t0 = Load128_UInt32(state.AsSpan());
+			var t1 = Load128_UInt32(state.AsSpan(4));
+			var t2 = Load128_UInt32(state.AsSpan(8));
+			var t3 = Load128_UInt32(state.AsSpan(12)); // words 12..15 as used for the first block
+			++state[12];
+			var t4 = Load128_UInt32(state.AsSpan(12)); // words 12..15 for the second block (word 12 is one higher)
+			++state[12]; // state is left advanced by two blocks
+
+			var x0 = Vector256.Create(t0, t0); // lower half = first block, upper half = second block
+			var x1 = Vector256.Create(t1, t1);
+			var x2 = Vector256.Create(t2, t2);
+			var x3 = Vector256.Create(t3, t4); // only this row differs between the two blocks
+
+			var v0 = x0;
+			var v1 = x1;
+			var v2 = x2;
+			var v3 = x3;
+
+			for (int i = rounds; i > 0; i -= 2) // each iteration performs two rounds
+			{
+				v0 = Avx2.Add(v0, v1);
+				v3 = Avx2.Xor(v3, v0);
+				v3 = Avx2.Xor(Avx2.ShiftLeftLogical(v3, 16), Avx2.ShiftRightLogical(v3, 16)); // rotate each 32-bit element left by 16
+				v2 = Avx2.Add(v2, v3);
+				v1 = Avx2.Xor(v1, v2);
+				v1 = Avx2.Xor(Avx2.ShiftLeftLogical(v1, 12), Avx2.ShiftRightLogical(v1, 20)); // rotate left by 12
+				v0 = Avx2.Add(v0, v1);
+				v3 = Avx2.Xor(v3, v0);
+				v3 = Avx2.Xor(Avx2.ShiftLeftLogical(v3, 8), Avx2.ShiftRightLogical(v3, 24)); // rotate left by 8
+				v2 = Avx2.Add(v2, v3);
+				v1 = Avx2.Xor(v1, v2);
+				v1 = Avx2.Xor(Avx2.ShiftLeftLogical(v1, 7), Avx2.ShiftRightLogical(v1, 25)); // rotate left by 7
+
+				v1 = Avx2.Shuffle(v1, 0x39); // rotate the elements of rows 1, 2, 3 by 1, 2, 3 positions within each 128-bit half
+				v2 = Avx2.Shuffle(v2, 0x4E);
+				v3 = Avx2.Shuffle(v3, 0x93);
+
+				v0 = Avx2.Add(v0, v1); // same add/xor/rotate sequence, now applied to the re-aligned rows
+				v3 = Avx2.Xor(v3, v0);
+				v3 = Avx2.Xor(Avx2.ShiftLeftLogical(v3, 16), Avx2.ShiftRightLogical(v3, 16));
+				v2 = Avx2.Add(v2, v3);
+				v1 = Avx2.Xor(v1, v2);
+				v1 = Avx2.Xor(Avx2.ShiftLeftLogical(v1, 12), Avx2.ShiftRightLogical(v1, 20));
+				v0 = Avx2.Add(v0, v1);
+				v3 = Avx2.Xor(v3, v0);
+				v3 = Avx2.Xor(Avx2.ShiftLeftLogical(v3, 8), Avx2.ShiftRightLogical(v3, 24));
+				v2 = Avx2.Add(v2, v3);
+				v1 = Avx2.Xor(v1, v2);
+				v1 = Avx2.Xor(Avx2.ShiftLeftLogical(v1, 7), Avx2.ShiftRightLogical(v1, 25));
+
+				v1 = Avx2.Shuffle(v1, 0x93); // inverse shuffles: restore the original element order
+				v2 = Avx2.Shuffle(v2, 0x4E);
+				v3 = Avx2.Shuffle(v3, 0x39);
+			}
+
+			v0 = Avx2.Add(v0, x0); // add the starting state back in
+			v1 = Avx2.Add(v1, x1);
+			v2 = Avx2.Add(v2, x2);
+			v3 = Avx2.Add(v3, x3);
+
+			var n0 = Avx2.Permute2x128(v0, v1, 0x20).AsByte(); // lower halves: rows 0-1 of the first block
+			var n1 = Avx2.Permute2x128(v2, v3, 0x20).AsByte(); // lower halves: rows 2-3 of the first block
+			var n2 = Avx2.Permute2x128(v0, v1, 0x31).AsByte(); // upper halves: rows 0-1 of the second block
+			var n3 = Avx2.Permute2x128(v2, v3, 0x31).AsByte(); // upper halves: rows 2-3 of the second block
+
+			n0 = Avx2.Xor(n0, Load256_Byte(input));
+			n1 = Avx2.Xor(n1, Load256_Byte(input.Slice(0x20)));
+			n2 = Avx2.Xor(n2, Load256_Byte(input.Slice(0x40)));
+			n3 = Avx2.Xor(n3, Load256_Byte(input.Slice(0x60)));
+
+			Store256_Byte(ref n0, output);
+			Store256_Byte(ref n1, output.Slice(0x20));
+			Store256_Byte(ref n2, output.Slice(0x40));
+			Store256_Byte(ref n3, output.Slice(0x60));
+		}
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+		internal static void ImplProcessBlocks2_X86_Sse2(int rounds, uint[] state, Span<byte> input, Span<byte> output) // XORs 128 input bytes with two keystream blocks computed one after the other; increments state[12] twice
+		{
+			if (!Sse2.IsSupported)
+				throw new PlatformNotSupportedException();
+
+			Debug.Assert(rounds % 2 == 0); // the loops below consume rounds two at a time
+			Debug.Assert(state.Length >= 16);
+			Debug.Assert(input.Length >= 128); // debug-only; release builds rely on the caller for lengths
+			Debug.Assert(output.Length >= 128);
+
+			var x0 = Load128_UInt32(state.AsSpan());
+			var x1 = Load128_UInt32(state.AsSpan(4));
+			var x2 = Load128_UInt32(state.AsSpan(8));
+			var x3 = Load128_UInt32(state.AsSpan(12)); // words 12..15 as used for the first block
+			++state[12];
+
+			var v0 = x0;
+			var v1 = x1;
+			var v2 = x2;
+			var v3 = x3;
+
+			for (int i = rounds; i > 0; i -= 2) // first block; each iteration performs two rounds
+			{
+				v0 = Sse2.Add(v0, v1);
+				v3 = Sse2.Xor(v3, v0);
+				v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16)); // rotate each 32-bit element left by 16
+				v2 = Sse2.Add(v2, v3);
+				v1 = Sse2.Xor(v1, v2);
+				v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20)); // rotate left by 12
+				v0 = Sse2.Add(v0, v1);
+				v3 = Sse2.Xor(v3, v0);
+				v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24)); // rotate left by 8
+				v2 = Sse2.Add(v2, v3);
+				v1 = Sse2.Xor(v1, v2);
+				v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25)); // rotate left by 7
+
+				v1 = Sse2.Shuffle(v1, 0x39); // rotate the elements of rows 1, 2, 3 by 1, 2, 3 positions
+				v2 = Sse2.Shuffle(v2, 0x4E);
+				v3 = Sse2.Shuffle(v3, 0x93);
+
+				v0 = Sse2.Add(v0, v1); // same add/xor/rotate sequence, now applied to the re-aligned rows
+				v3 = Sse2.Xor(v3, v0);
+				v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16));
+				v2 = Sse2.Add(v2, v3);
+				v1 = Sse2.Xor(v1, v2);
+				v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20));
+				v0 = Sse2.Add(v0, v1);
+				v3 = Sse2.Xor(v3, v0);
+				v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24));
+				v2 = Sse2.Add(v2, v3);
+				v1 = Sse2.Xor(v1, v2);
+				v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25));
+
+				v1 = Sse2.Shuffle(v1, 0x93); // inverse shuffles: restore the original element order
+				v2 = Sse2.Shuffle(v2, 0x4E);
+				v3 = Sse2.Shuffle(v3, 0x39);
+			}
+
+			v0 = Sse2.Add(v0, x0); // add the starting state back in
+			v1 = Sse2.Add(v1, x1);
+			v2 = Sse2.Add(v2, x2);
+			v3 = Sse2.Add(v3, x3);
+
+			var n0 = Load128_Byte(input); // input bytes 0x00..0x3F are XORed with the first block
+			var n1 = Load128_Byte(input.Slice(0x10));
+			var n2 = Load128_Byte(input.Slice(0x20));
+			var n3 = Load128_Byte(input.Slice(0x30));
+
+			n0 = Sse2.Xor(n0, v0.AsByte());
+			n1 = Sse2.Xor(n1, v1.AsByte());
+			n2 = Sse2.Xor(n2, v2.AsByte());
+			n3 = Sse2.Xor(n3, v3.AsByte());
+
+			Store128_Byte(ref n0, output);
+			Store128_Byte(ref n1, output.Slice(0x10));
+			Store128_Byte(ref n2, output.Slice(0x20));
+			Store128_Byte(ref n3, output.Slice(0x30));
+
+			x3 = Load128_UInt32(state.AsSpan(12)); // reload words 12..15 after the increment above; x0..x2 are reused unchanged
+			++state[12]; // state is left advanced by two blocks
+
+			v0 = x0;
+			v1 = x1;
+			v2 = x2;
+			v3 = x3;
+
+			for (int i = rounds; i > 0; i -= 2) // second block: identical round sequence
+			{
+				v0 = Sse2.Add(v0, v1);
+				v3 = Sse2.Xor(v3, v0);
+				v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16));
+				v2 = Sse2.Add(v2, v3);
+				v1 = Sse2.Xor(v1, v2);
+				v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20));
+				v0 = Sse2.Add(v0, v1);
+				v3 = Sse2.Xor(v3, v0);
+				v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24));
+				v2 = Sse2.Add(v2, v3);
+				v1 = Sse2.Xor(v1, v2);
+				v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25));
+
+				v1 = Sse2.Shuffle(v1, 0x39);
+				v2 = Sse2.Shuffle(v2, 0x4E);
+				v3 = Sse2.Shuffle(v3, 0x93);
+
+				v0 = Sse2.Add(v0, v1);
+				v3 = Sse2.Xor(v3, v0);
+				v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16));
+				v2 = Sse2.Add(v2, v3);
+				v1 = Sse2.Xor(v1, v2);
+				v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20));
+				v0 = Sse2.Add(v0, v1);
+				v3 = Sse2.Xor(v3, v0);
+				v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24));
+				v2 = Sse2.Add(v2, v3);
+				v1 = Sse2.Xor(v1, v2);
+				v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25));
+
+				v1 = Sse2.Shuffle(v1, 0x93);
+				v2 = Sse2.Shuffle(v2, 0x4E);
+				v3 = Sse2.Shuffle(v3, 0x39);
+			}
+
+			v0 = Sse2.Add(v0, x0);
+			v1 = Sse2.Add(v1, x1);
+			v2 = Sse2.Add(v2, x2);
+			v3 = Sse2.Add(v3, x3);
+
+			n0 = Load128_Byte(input.Slice(0x40)); // input bytes 0x40..0x7F are XORed with the second block
+			n1 = Load128_Byte(input.Slice(0x50));
+			n2 = Load128_Byte(input.Slice(0x60));
+			n3 = Load128_Byte(input.Slice(0x70));
+
+			n0 = Sse2.Xor(n0, v0.AsByte());
+			n1 = Sse2.Xor(n1, v1.AsByte());
+			n2 = Sse2.Xor(n2, v2.AsByte());
+			n3 = Sse2.Xor(n3, v3.AsByte());
+
+			Store128_Byte(ref n0, output.Slice(0x40));
+			Store128_Byte(ref n1, output.Slice(0x50));
+			Store128_Byte(ref n2, output.Slice(0x60));
+			Store128_Byte(ref n3, output.Slice(0x70));
+		}
+
+		[MethodImpl(MethodImplOptions.AggressiveInlining)]
+		private static Vector128<byte> Load128_Byte(Span<byte> t) // reads the first 16 bytes of t into a vector
+		{
+			if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<byte>>() == 16) // fast path: one unaligned 16-byte read
+				return Unsafe.ReadUnaligned<Vector128<byte>>(ref t[0]); // NOTE(review): only t[0] is bounds-checked; callers must guarantee t.Length >= 16
+
+			return Vector128.Create(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], t[8], t[9], t[10], t[11], t[12],
+				t[13], t[14], t[15]); // fallback: element-by-element, every index bounds-checked
+		}
+
+		[MethodImpl(MethodImplOptions.AggressiveInlining)]
+		private static Vector128<uint> Load128_UInt32(Span<uint> t) // reads the first four uints of t into a vector
+		{
+			if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<uint>>() == 16) // fast path: one unaligned 16-byte read
+				return Unsafe.ReadUnaligned<Vector128<uint>>(ref Unsafe.As<uint, byte>(ref t[0])); // NOTE(review): only t[0] is bounds-checked; callers must guarantee t.Length >= 4
+
+			return Vector128.Create(t[0], t[1], t[2], t[3]); // fallback: element-by-element
+		}
+
+		[MethodImpl(MethodImplOptions.AggressiveInlining)]
+		private static Vector256<byte> Load256_Byte(Span<byte> t) // reads the first 32 bytes of t into a vector
+        {
+			if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector256<byte>>() == 32) // fast path: one unaligned 32-byte read
+				return Unsafe.ReadUnaligned<Vector256<byte>>(ref t[0]); // NOTE(review): only t[0] is bounds-checked; callers must guarantee t.Length >= 32
+
+			return Vector256.Create(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], t[8], t[9], t[10], t[11], t[12],
+				t[13], t[14], t[15], t[16], t[17], t[18], t[19], t[20], t[21], t[22], t[23], t[24], t[25], t[26], t[27],
+				t[28], t[29], t[30], t[31]); // fallback: element-by-element, every index bounds-checked
+		}
+
+		[MethodImpl(MethodImplOptions.AggressiveInlining)]
+		private static void Store128_Byte(ref Vector128<byte> s, Span<byte> t) // writes the 16 bytes of s to the start of t
+		{
+			if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<byte>>() == 16) // fast path: one unaligned 16-byte write
+			{
+				Unsafe.WriteUnaligned(ref t[0], s); // NOTE(review): only t[0] is bounds-checked; callers must guarantee t.Length >= 16
+				return;
+			}
+
+			var u = s.AsUInt64(); // fallback: emit the vector as two 64-bit halves
+			Pack.UInt64_To_LE(u.GetElement(0), t);
+			Pack.UInt64_To_LE(u.GetElement(1), t.Slice(8));
+		}
+
+		[MethodImpl(MethodImplOptions.AggressiveInlining)]
+		private static void Store256_Byte(ref Vector256<byte> s, Span<byte> t) // writes the 32 bytes of s to the start of t
+		{
+			if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector256<byte>>() == 32) // fast path: one unaligned 32-byte write
+			{
+				Unsafe.WriteUnaligned(ref t[0], s); // NOTE(review): only t[0] is bounds-checked; callers must guarantee t.Length >= 32
+				return;
+			}
+
+			var u = s.AsUInt64(); // fallback: emit the vector as four 64-bit parts
+			Pack.UInt64_To_LE(u.GetElement(0), t);
+			Pack.UInt64_To_LE(u.GetElement(1), t.Slice(8));
+			Pack.UInt64_To_LE(u.GetElement(2), t.Slice(16));
+			Pack.UInt64_To_LE(u.GetElement(3), t.Slice(24));
+		}
+#endif
+	}
+}
diff --git a/crypto/src/crypto/engines/ChaChaEngine.cs b/crypto/src/crypto/engines/ChaChaEngine.cs
index a97c04e08..a16491ba0 100644
--- a/crypto/src/crypto/engines/ChaChaEngine.cs
+++ b/crypto/src/crypto/engines/ChaChaEngine.cs
@@ -1,4 +1,10 @@
 using System;
+using System.Diagnostics;
+#if NETCOREAPP3_0_OR_GREATER
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 using Org.BouncyCastle.Crypto.Utilities;
 using Org.BouncyCastle.Utilities;
@@ -65,94 +71,169 @@ namespace Org.BouncyCastle.Crypto.Engines
 
 		protected override void GenerateKeyStream(byte[] output)
 		{
-			ChachaCore(rounds, engineState, x);
-			Pack.UInt32_To_LE(x, output, 0);
+			ChachaCore(rounds, engineState, output);
 		}
 
-		/// <summary>
-		/// ChaCha function.
-		/// </summary>
-		/// <param name="rounds">The number of ChaCha rounds to execute</param>
-		/// <param name="input">The input words.</param>
-		/// <param name="x">The ChaCha state to modify.</param>
-		internal static void ChachaCore(int rounds, uint[] input, uint[] x)
+		internal static void ChachaCore(int rounds, uint[] input, byte[] output) // runs `rounds` rounds over the 16-word input, adds the input back in, and writes the 64-byte result to output; input is not modified
+		{
+			Debug.Assert(rounds % 2 == 0); // debug-only: replaces the ArgumentException checks of the previous version
+			Debug.Assert(input.Length >= 16);
+			Debug.Assert(output.Length >= 64);
+
+#if NETCOREAPP3_0_OR_GREATER
+			if (Sse2.IsSupported) // vector path: each of the four state rows lives in one 128-bit register
+			{
+				var x0 = Load128_UInt32(input.AsSpan());
+				var x1 = Load128_UInt32(input.AsSpan(4));
+				var x2 = Load128_UInt32(input.AsSpan(8));
+				var x3 = Load128_UInt32(input.AsSpan(12));
+
+				var v0 = x0;
+				var v1 = x1;
+				var v2 = x2;
+				var v3 = x3;
+
+				for (int i = rounds; i > 0; i -= 2) // each iteration performs two rounds
+				{
+					v0 = Sse2.Add(v0, v1);
+					v3 = Sse2.Xor(v3, v0);
+					v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16)); // rotate each 32-bit element left by 16
+					v2 = Sse2.Add(v2, v3);
+					v1 = Sse2.Xor(v1, v2);
+					v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20)); // rotate left by 12
+					v0 = Sse2.Add(v0, v1);
+					v3 = Sse2.Xor(v3, v0);
+					v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24)); // rotate left by 8
+					v2 = Sse2.Add(v2, v3);
+					v1 = Sse2.Xor(v1, v2);
+					v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25)); // rotate left by 7
+
+					v1 = Sse2.Shuffle(v1, 0x39); // rotate the elements of rows 1, 2, 3 by 1, 2, 3 positions
+					v2 = Sse2.Shuffle(v2, 0x4E);
+					v3 = Sse2.Shuffle(v3, 0x93);
+
+					v0 = Sse2.Add(v0, v1); // same add/xor/rotate sequence, now applied to the re-aligned rows
+					v3 = Sse2.Xor(v3, v0);
+					v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16));
+					v2 = Sse2.Add(v2, v3);
+					v1 = Sse2.Xor(v1, v2);
+					v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20));
+					v0 = Sse2.Add(v0, v1);
+					v3 = Sse2.Xor(v3, v0);
+					v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24));
+					v2 = Sse2.Add(v2, v3);
+					v1 = Sse2.Xor(v1, v2);
+					v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25));
+
+					v1 = Sse2.Shuffle(v1, 0x93); // inverse shuffles: restore the original element order
+					v2 = Sse2.Shuffle(v2, 0x4E);
+					v3 = Sse2.Shuffle(v3, 0x39);
+				}
+
+				v0 = Sse2.Add(v0, x0); // add the starting state back in
+				v1 = Sse2.Add(v1, x1);
+				v2 = Sse2.Add(v2, x2);
+				v3 = Sse2.Add(v3, x3);
+
+				Store128_UInt32(ref v0, output.AsSpan()); // four 16-byte rows written at offsets 0x00, 0x10, 0x20, 0x30
+				Store128_UInt32(ref v1, output.AsSpan(0x10));
+				Store128_UInt32(ref v2, output.AsSpan(0x20));
+				Store128_UInt32(ref v3, output.AsSpan(0x30));
+				return;
+			}
+#endif
+
+            { // scalar path: the 16 state words held in locals
+				uint x00 = input[ 0], x01 = input[ 1], x02 = input[ 2], x03 = input[ 3];
+				uint x04 = input[ 4], x05 = input[ 5], x06 = input[ 6], x07 = input[ 7];
+				uint x08 = input[ 8], x09 = input[ 9], x10 = input[10], x11 = input[11];
+				uint x12 = input[12], x13 = input[13], x14 = input[14], x15 = input[15];
+
+				for (int i = rounds; i > 0; i -= 2) // each iteration performs two rounds
+				{
+					x00 += x04; x12 = Integers.RotateLeft(x12 ^ x00, 16); // first round: word groups (0,4,8,12), (1,5,9,13), (2,6,10,14), (3,7,11,15), interleaved step by step
+					x01 += x05; x13 = Integers.RotateLeft(x13 ^ x01, 16);
+					x02 += x06; x14 = Integers.RotateLeft(x14 ^ x02, 16);
+					x03 += x07; x15 = Integers.RotateLeft(x15 ^ x03, 16);
+
+					x08 += x12; x04 = Integers.RotateLeft(x04 ^ x08, 12);
+					x09 += x13; x05 = Integers.RotateLeft(x05 ^ x09, 12);
+					x10 += x14; x06 = Integers.RotateLeft(x06 ^ x10, 12);
+					x11 += x15; x07 = Integers.RotateLeft(x07 ^ x11, 12);
+
+					x00 += x04; x12 = Integers.RotateLeft(x12 ^ x00, 8);
+					x01 += x05; x13 = Integers.RotateLeft(x13 ^ x01, 8);
+					x02 += x06; x14 = Integers.RotateLeft(x14 ^ x02, 8);
+					x03 += x07; x15 = Integers.RotateLeft(x15 ^ x03, 8);
+
+					x08 += x12; x04 = Integers.RotateLeft(x04 ^ x08, 7);
+					x09 += x13; x05 = Integers.RotateLeft(x05 ^ x09, 7);
+					x10 += x14; x06 = Integers.RotateLeft(x06 ^ x10, 7);
+					x11 += x15; x07 = Integers.RotateLeft(x07 ^ x11, 7);
+
+					x00 += x05; x15 = Integers.RotateLeft(x15 ^ x00, 16); // second round: word groups (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14)
+					x01 += x06; x12 = Integers.RotateLeft(x12 ^ x01, 16);
+					x02 += x07; x13 = Integers.RotateLeft(x13 ^ x02, 16);
+					x03 += x04; x14 = Integers.RotateLeft(x14 ^ x03, 16);
+
+					x10 += x15; x05 = Integers.RotateLeft(x05 ^ x10, 12);
+					x11 += x12; x06 = Integers.RotateLeft(x06 ^ x11, 12);
+					x08 += x13; x07 = Integers.RotateLeft(x07 ^ x08, 12);
+					x09 += x14; x04 = Integers.RotateLeft(x04 ^ x09, 12);
+
+					x00 += x05; x15 = Integers.RotateLeft(x15 ^ x00, 8);
+					x01 += x06; x12 = Integers.RotateLeft(x12 ^ x01, 8);
+					x02 += x07; x13 = Integers.RotateLeft(x13 ^ x02, 8);
+					x03 += x04; x14 = Integers.RotateLeft(x14 ^ x03, 8);
+
+					x10 += x15; x05 = Integers.RotateLeft(x05 ^ x10, 7);
+					x11 += x12; x06 = Integers.RotateLeft(x06 ^ x11, 7);
+					x08 += x13; x07 = Integers.RotateLeft(x07 ^ x08, 7);
+					x09 += x14; x04 = Integers.RotateLeft(x04 ^ x09, 7);
+				}
+
+				Pack.UInt32_To_LE(x00 + input[ 0], output,  0); // each word plus its starting value, written at byte offset 4 * index
+				Pack.UInt32_To_LE(x01 + input[ 1], output,  4);
+				Pack.UInt32_To_LE(x02 + input[ 2], output,  8);
+				Pack.UInt32_To_LE(x03 + input[ 3], output, 12);
+				Pack.UInt32_To_LE(x04 + input[ 4], output, 16);
+				Pack.UInt32_To_LE(x05 + input[ 5], output, 20);
+				Pack.UInt32_To_LE(x06 + input[ 6], output, 24);
+				Pack.UInt32_To_LE(x07 + input[ 7], output, 28);
+				Pack.UInt32_To_LE(x08 + input[ 8], output, 32);
+				Pack.UInt32_To_LE(x09 + input[ 9], output, 36);
+				Pack.UInt32_To_LE(x10 + input[10], output, 40);
+				Pack.UInt32_To_LE(x11 + input[11], output, 44);
+				Pack.UInt32_To_LE(x12 + input[12], output, 48);
+				Pack.UInt32_To_LE(x13 + input[13], output, 52);
+				Pack.UInt32_To_LE(x14 + input[14], output, 56);
+				Pack.UInt32_To_LE(x15 + input[15], output, 60);
+			}
+		}
+
+#if NETCOREAPP3_0_OR_GREATER
+		[MethodImpl(MethodImplOptions.AggressiveInlining)]
+		private static Vector128<uint> Load128_UInt32(Span<uint> t) // reads the first four uints of t into a vector
+		{
+			if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<uint>>() == 16) // fast path: one unaligned 16-byte read
+				return Unsafe.ReadUnaligned<Vector128<uint>>(ref Unsafe.As<uint, byte>(ref t[0])); // NOTE(review): only t[0] is bounds-checked; callers must guarantee t.Length >= 4
+
+			return Vector128.Create(t[0], t[1], t[2], t[3]); // fallback: element-by-element
+		}
+
+		[MethodImpl(MethodImplOptions.AggressiveInlining)]
+		private static void Store128_UInt32(ref Vector128<uint> s, Span<byte> t)
 		{
-			if (input.Length != 16)
-				throw new ArgumentException();
-			if (x.Length != 16)
-				throw new ArgumentException();
-			if (rounds % 2 != 0)
-				throw new ArgumentException("Number of rounds must be even");
-
-            uint x00 = input[ 0];
-			uint x01 = input[ 1];
-			uint x02 = input[ 2];
-			uint x03 = input[ 3];
-			uint x04 = input[ 4];
-			uint x05 = input[ 5];
-			uint x06 = input[ 6];
-			uint x07 = input[ 7];
-			uint x08 = input[ 8];
-			uint x09 = input[ 9];
-			uint x10 = input[10];
-			uint x11 = input[11];
-			uint x12 = input[12];
-			uint x13 = input[13];
-			uint x14 = input[14];
-			uint x15 = input[15];
-
-			for (int i = rounds; i > 0; i -= 2)
+			if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<uint>>() == 16)
 			{
-				x00 += x04; x12 = Integers.RotateLeft(x12 ^ x00, 16);
-				x08 += x12; x04 = Integers.RotateLeft(x04 ^ x08, 12);
-				x00 += x04; x12 = Integers.RotateLeft(x12 ^ x00, 8);
-				x08 += x12; x04 = Integers.RotateLeft(x04 ^ x08, 7);
-				x01 += x05; x13 = Integers.RotateLeft(x13 ^ x01, 16);
-				x09 += x13; x05 = Integers.RotateLeft(x05 ^ x09, 12);
-				x01 += x05; x13 = Integers.RotateLeft(x13 ^ x01, 8);
-				x09 += x13; x05 = Integers.RotateLeft(x05 ^ x09, 7);
-				x02 += x06; x14 = Integers.RotateLeft(x14 ^ x02, 16);
-				x10 += x14; x06 = Integers.RotateLeft(x06 ^ x10, 12);
-				x02 += x06; x14 = Integers.RotateLeft(x14 ^ x02, 8);
-				x10 += x14; x06 = Integers.RotateLeft(x06 ^ x10, 7);
-				x03 += x07; x15 = Integers.RotateLeft(x15 ^ x03, 16);
-				x11 += x15; x07 = Integers.RotateLeft(x07 ^ x11, 12);
-				x03 += x07; x15 = Integers.RotateLeft(x15 ^ x03, 8);
-				x11 += x15; x07 = Integers.RotateLeft(x07 ^ x11, 7);
-				x00 += x05; x15 = Integers.RotateLeft(x15 ^ x00, 16);
-				x10 += x15; x05 = Integers.RotateLeft(x05 ^ x10, 12);
-				x00 += x05; x15 = Integers.RotateLeft(x15 ^ x00, 8);
-				x10 += x15; x05 = Integers.RotateLeft(x05 ^ x10, 7);
-				x01 += x06; x12 = Integers.RotateLeft(x12 ^ x01, 16);
-				x11 += x12; x06 = Integers.RotateLeft(x06 ^ x11, 12);
-				x01 += x06; x12 = Integers.RotateLeft(x12 ^ x01, 8);
-				x11 += x12; x06 = Integers.RotateLeft(x06 ^ x11, 7);
-				x02 += x07; x13 = Integers.RotateLeft(x13 ^ x02, 16);
-				x08 += x13; x07 = Integers.RotateLeft(x07 ^ x08, 12);
-				x02 += x07; x13 = Integers.RotateLeft(x13 ^ x02, 8);
-				x08 += x13; x07 = Integers.RotateLeft(x07 ^ x08, 7);
-				x03 += x04; x14 = Integers.RotateLeft(x14 ^ x03, 16);
-				x09 += x14; x04 = Integers.RotateLeft(x04 ^ x09, 12);
-				x03 += x04; x14 = Integers.RotateLeft(x14 ^ x03, 8);
-				x09 += x14; x04 = Integers.RotateLeft(x04 ^ x09, 7);
+				Unsafe.WriteUnaligned(ref t[0], s);
+				return;
 			}
 
-			x[ 0] = x00 + input[ 0];
-			x[ 1] = x01 + input[ 1];
-			x[ 2] = x02 + input[ 2];
-			x[ 3] = x03 + input[ 3];
-			x[ 4] = x04 + input[ 4];
-			x[ 5] = x05 + input[ 5];
-			x[ 6] = x06 + input[ 6];
-			x[ 7] = x07 + input[ 7];
-			x[ 8] = x08 + input[ 8];
-			x[ 9] = x09 + input[ 9];
-			x[10] = x10 + input[10];
-			x[11] = x11 + input[11];
-			x[12] = x12 + input[12];
-			x[13] = x13 + input[13];
-			x[14] = x14 + input[14];
-			x[15] = x15 + input[15];
+			var u = s.AsUInt64();
+			Pack.UInt64_To_LE(u.GetElement(0), t);
+			Pack.UInt64_To_LE(u.GetElement(1), t.Slice(8));
 		}
+#endif
 	}
 }
diff --git a/crypto/src/crypto/engines/Salsa20Engine.cs b/crypto/src/crypto/engines/Salsa20Engine.cs
index a8170d173..77b08f9fc 100644
--- a/crypto/src/crypto/engines/Salsa20Engine.cs
+++ b/crypto/src/crypto/engines/Salsa20Engine.cs
@@ -35,11 +35,11 @@ namespace Org.BouncyCastle.Crypto.Engines
 		 * variables to hold the state of the engine
 		 * during encryption and decryption
 		 */
-		private int		 index = 0;
+		internal int index = 0;
 		internal uint[] engineState = new uint[StateSize]; // state
 		internal uint[] x = new uint[StateSize]; // internal buffer
-		private byte[]	 keyStream = new byte[StateSize * 4]; // expanded state, 64 bytes
-		private bool	 initialised = false;
+		internal byte[] keyStream = new byte[StateSize * 4]; // expanded state, 64 bytes
+		internal bool initialised = false;
 
 		/*
 		 * internal counter
@@ -302,14 +302,14 @@ namespace Org.BouncyCastle.Crypto.Engines
 			x[15] = x15 + input[15];
 		}
 
-		private void ResetLimitCounter()
+		internal void ResetLimitCounter()
 		{
 			cW0 = 0;
 			cW1 = 0;
 			cW2 = 0;
 		}
 
-		private bool LimitExceeded()
+		internal bool LimitExceeded()
 		{
 			if (++cW0 == 0)
 			{
@@ -325,7 +325,7 @@ namespace Org.BouncyCastle.Crypto.Engines
 		/*
 		 * this relies on the fact len will always be positive.
 		 */
-		private bool LimitExceeded(
+		internal bool LimitExceeded(
 			uint len)
 		{
 			uint old = cW0;
diff --git a/crypto/src/crypto/macs/Poly1305.cs b/crypto/src/crypto/macs/Poly1305.cs
index c0a660fac..595d9b051 100644
--- a/crypto/src/crypto/macs/Poly1305.cs
+++ b/crypto/src/crypto/macs/Poly1305.cs
@@ -1,6 +1,9 @@
 using System;
+using System.Diagnostics;
+#if NETCOREAPP3_0_OR_GREATER
+using System.Runtime.CompilerServices;
+#endif
 
-using Org.BouncyCastle.Crypto.Generators;
 using Org.BouncyCastle.Crypto.Parameters;
 using Org.BouncyCastle.Crypto.Utilities;
 
@@ -27,8 +30,6 @@ namespace Org.BouncyCastle.Crypto.Macs
 
         private readonly IBlockCipher cipher;
 
-        private readonly byte[] singleByte = new byte[1];
-
         // Initialised state
 
         /** Polynomial key */
@@ -163,61 +164,79 @@ namespace Org.BouncyCastle.Crypto.Macs
 
         public void Update(byte input)
         {
-            singleByte[0] = input;
-            BlockUpdate(singleByte, 0, 1);
+            currentBlock[currentBlockOffset++] = input;
+            if (currentBlockOffset == BlockSize)
+            {
+                ProcessBlock(currentBlock, 0);
+                currentBlockOffset = 0;
+            }
         }
 
         public void BlockUpdate(byte[] input, int inOff, int len)
         {
-            int copied = 0;
-            while (len > copied)
+            // TODO Validity check on arguments
+
+            int available = BlockSize - currentBlockOffset;
+            if (len < available)
             {
-                if (currentBlockOffset == BlockSize)
-                {
-                    ProcessBlock();
-                    currentBlockOffset = 0;
-                }
+                Array.Copy(input, inOff, currentBlock, currentBlockOffset, len);
+                currentBlockOffset += len;
+                return;
+            }
 
-                int toCopy = System.Math.Min((len - copied), BlockSize - currentBlockOffset);
-                Array.Copy(input, copied + inOff, currentBlock, currentBlockOffset, toCopy);
-                copied += toCopy;
-                currentBlockOffset += toCopy;
+            int pos = 0;
+            if (currentBlockOffset > 0)
+            {
+                Array.Copy(input, inOff, currentBlock, currentBlockOffset, available);
+                pos = available;
+                ProcessBlock(currentBlock, 0);
             }
 
+            int remaining;
+            while ((remaining = len - pos) >= BlockSize)
+            {
+                ProcessBlock(input, inOff + pos);
+                pos += BlockSize;
+            }
+
+            Array.Copy(input, inOff + pos, currentBlock, 0, remaining);
+            currentBlockOffset = remaining;
         }
 
-        private void ProcessBlock()
+        private void ProcessBlock(byte[] buf, int off)
         {
-            if (currentBlockOffset < BlockSize)
+#if NETCOREAPP3_0_OR_GREATER
+            if (BitConverter.IsLittleEndian)
             {
-                currentBlock[currentBlockOffset] = 1;
-                for (int i = currentBlockOffset + 1; i < BlockSize; i++)
-                {
-                    currentBlock[i] = 0;
-                }
+                Span<uint> t = stackalloc uint[4];
+                Unsafe.CopyBlockUnaligned(ref Unsafe.As<uint, byte>(ref t[0]), ref buf[off], 16);
+
+                h0 +=   t[0]                        & 0x3ffffffU;
+                h1 += ((t[1] <<  6) | (t[0] >> 26)) & 0x3ffffffU;
+                h2 += ((t[2] << 12) | (t[1] >> 20)) & 0x3ffffffU;
+                h3 += ((t[3] << 18) | (t[2] >> 14)) & 0x3ffffffU;
+                h4 +=     (1 << 24) | (t[3] >>  8);
             }
-
-            ulong t0 = Pack.LE_To_UInt32(currentBlock, 0);
-            ulong t1 = Pack.LE_To_UInt32(currentBlock, 4);
-            ulong t2 = Pack.LE_To_UInt32(currentBlock, 8);
-            ulong t3 = Pack.LE_To_UInt32(currentBlock, 12);
-
-            h0 += (uint)(t0 & 0x3ffffffU);
-            h1 += (uint)((((t1 << 32) | t0) >> 26) & 0x3ffffff);
-            h2 += (uint)((((t2 << 32) | t1) >> 20) & 0x3ffffff);
-            h3 += (uint)((((t3 << 32) | t2) >> 14) & 0x3ffffff);
-            h4 += (uint)(t3 >> 8);
-
-            if (currentBlockOffset == BlockSize)
+            else
+#endif
             {
-                h4 += (1 << 24);
+                uint t0 = Pack.LE_To_UInt32(buf, off +  0);
+                uint t1 = Pack.LE_To_UInt32(buf, off +  4);
+                uint t2 = Pack.LE_To_UInt32(buf, off +  8);
+                uint t3 = Pack.LE_To_UInt32(buf, off + 12);
+
+                h0 +=   t0                      & 0x3ffffffU;
+                h1 += ((t1 <<  6) | (t0 >> 26)) & 0x3ffffffU;
+                h2 += ((t2 << 12) | (t1 >> 20)) & 0x3ffffffU;
+                h3 += ((t3 << 18) | (t2 >> 14)) & 0x3ffffffU;
+                h4 +=  ( 1 << 24) | (t3 >>  8);
             }
 
-            ulong tp0 = mul32x32_64(h0,r0) + mul32x32_64(h1,s4) + mul32x32_64(h2,s3) + mul32x32_64(h3,s2) + mul32x32_64(h4,s1);
-            ulong tp1 = mul32x32_64(h0,r1) + mul32x32_64(h1,r0) + mul32x32_64(h2,s4) + mul32x32_64(h3,s3) + mul32x32_64(h4,s2);
-            ulong tp2 = mul32x32_64(h0,r2) + mul32x32_64(h1,r1) + mul32x32_64(h2,r0) + mul32x32_64(h3,s4) + mul32x32_64(h4,s3);
-            ulong tp3 = mul32x32_64(h0,r3) + mul32x32_64(h1,r2) + mul32x32_64(h2,r1) + mul32x32_64(h3,r0) + mul32x32_64(h4,s4);
-            ulong tp4 = mul32x32_64(h0,r4) + mul32x32_64(h1,r3) + mul32x32_64(h2,r2) + mul32x32_64(h3,r1) + mul32x32_64(h4,r0);
+            ulong tp0 = (ulong)h0 * r0 + (ulong)h1 * s4 + (ulong)h2 * s3 + (ulong)h3 * s2 + (ulong)h4 * s1;
+            ulong tp1 = (ulong)h0 * r1 + (ulong)h1 * r0 + (ulong)h2 * s4 + (ulong)h3 * s3 + (ulong)h4 * s2;
+            ulong tp2 = (ulong)h0 * r2 + (ulong)h1 * r1 + (ulong)h2 * r0 + (ulong)h3 * s4 + (ulong)h4 * s3;
+            ulong tp3 = (ulong)h0 * r3 + (ulong)h1 * r2 + (ulong)h2 * r1 + (ulong)h3 * r0 + (ulong)h4 * s4;
+            ulong tp4 = (ulong)h0 * r4 + (ulong)h1 * r3 + (ulong)h2 * r2 + (ulong)h3 * r1 + (ulong)h4 * r0;
 
             h0 = (uint)tp0 & 0x3ffffff; tp1 += (tp0 >> 26);
             h1 = (uint)tp1 & 0x3ffffff; tp2 += (tp1 >> 26);
@@ -225,7 +244,7 @@ namespace Org.BouncyCastle.Crypto.Macs
             h3 = (uint)tp3 & 0x3ffffff; tp4 += (tp3 >> 26);
             h4 = (uint)tp4 & 0x3ffffff;
             h0 += (uint)(tp4 >> 26) * 5;
-            h1 += (h0 >> 26); h0 &= 0x3ffffff;
+            h1 += h0 >> 26; h0 &= 0x3ffffff;
         }
 
         public int DoFinal(byte[] output, int outOff)
@@ -235,44 +254,38 @@ namespace Org.BouncyCastle.Crypto.Macs
             if (currentBlockOffset > 0)
             {
                 // Process padded block
-                ProcessBlock();
+                if (currentBlockOffset < BlockSize)
+                {
+                    currentBlock[currentBlockOffset++] = 1;
+                    while (currentBlockOffset < BlockSize)
+                    {
+                        currentBlock[currentBlockOffset++] = 0;
+                    }
+
+                    h4 -= (1 << 24);
+                }
+
+                ProcessBlock(currentBlock, 0);
             }
 
-            h1 += (h0 >> 26); h0 &= 0x3ffffff;
-            h2 += (h1 >> 26); h1 &= 0x3ffffff;
-            h3 += (h2 >> 26); h2 &= 0x3ffffff;
-            h4 += (h3 >> 26); h3 &= 0x3ffffff;
-            h0 += (h4 >> 26) * 5; h4 &= 0x3ffffff;
-            h1 += (h0 >> 26); h0 &= 0x3ffffff;
-
-            uint g0, g1, g2, g3, g4, b;
-            g0 = h0 + 5; b = g0 >> 26; g0 &= 0x3ffffff;
-            g1 = h1 + b; b = g1 >> 26; g1 &= 0x3ffffff;
-            g2 = h2 + b; b = g2 >> 26; g2 &= 0x3ffffff;
-            g3 = h3 + b; b = g3 >> 26; g3 &= 0x3ffffff;
-            g4 = h4 + b - (1 << 26);
-
-            b = (g4 >> 31) - 1;
-            uint nb = ~b;
-            h0 = (h0 & nb) | (g0 & b);
-            h1 = (h1 & nb) | (g1 & b);
-            h2 = (h2 & nb) | (g2 & b);
-            h3 = (h3 & nb) | (g3 & b);
-            h4 = (h4 & nb) | (g4 & b);
-
-            ulong f0, f1, f2, f3;
-            f0 = ((h0      ) | (h1 << 26)) + (ulong)k0;
-            f1 = ((h1 >> 6 ) | (h2 << 20)) + (ulong)k1;
-            f2 = ((h2 >> 12) | (h3 << 14)) + (ulong)k2;
-            f3 = ((h3 >> 18) | (h4 << 8 )) + (ulong)k3;
-
-            Pack.UInt32_To_LE((uint)f0, output, outOff);
-            f1 += (f0 >> 32);
-            Pack.UInt32_To_LE((uint)f1, output, outOff + 4);
-            f2 += (f1 >> 32);
-            Pack.UInt32_To_LE((uint)f2, output, outOff + 8);
-            f3 += (f2 >> 32);
-            Pack.UInt32_To_LE((uint)f3, output, outOff + 12);
+            Debug.Assert(h4 >> 26 == 0);
+
+            //h0 += (h4 >> 26) * 5U + 5U; h4 &= 0x3ffffff;
+            h0 += 5U;
+            h1 += h0 >> 26; h0 &= 0x3ffffff;
+            h2 += h1 >> 26; h1 &= 0x3ffffff;
+            h3 += h2 >> 26; h2 &= 0x3ffffff;
+            h4 += h3 >> 26; h3 &= 0x3ffffff;
+
+            long c = ((int)(h4 >> 26) - 1) * 5;
+            c += (long)k0 + ((h0      ) | (h1 << 26));
+            Pack.UInt32_To_LE((uint)c, output, outOff     ); c >>= 32;
+            c += (long)k1 + ((h1 >>  6) | (h2 << 20));
+            Pack.UInt32_To_LE((uint)c, output, outOff +  4); c >>= 32;
+            c += (long)k2 + ((h2 >> 12) | (h3 << 14));
+            Pack.UInt32_To_LE((uint)c, output, outOff +  8); c >>= 32;
+            c += (long)k3 + ((h3 >> 18) | (h4 << 8));
+            Pack.UInt32_To_LE((uint)c, output, outOff + 12);
 
             Reset();
             return BlockSize;
@@ -284,10 +297,5 @@ namespace Org.BouncyCastle.Crypto.Macs
 
             h0 = h1 = h2 = h3 = h4 = 0;
         }
-
-        private static ulong mul32x32_64(uint i1, uint i2)
-        {
-            return ((ulong)i1) * i2;
-        }
     }
 }
diff --git a/crypto/src/crypto/modes/ChaCha20Poly1305.cs b/crypto/src/crypto/modes/ChaCha20Poly1305.cs
index 6ca32d9c6..462013200 100644
--- a/crypto/src/crypto/modes/ChaCha20Poly1305.cs
+++ b/crypto/src/crypto/modes/ChaCha20Poly1305.cs
@@ -221,7 +221,7 @@ namespace Org.BouncyCastle.Crypto.Modes
                 if (++mBufPos == mBuf.Length)
                 {
                     mPoly1305.BlockUpdate(mBuf, 0, BufSize);
-                    ProcessData(mBuf, 0, BufSize, outBytes, outOff);
+                    ProcessBlock(mBuf, 0, outBytes, outOff);
                     Array.Copy(mBuf, BufSize, mBuf, 0, MacSize);
                     this.mBufPos = MacSize;
                     return BufSize;
@@ -234,7 +234,7 @@ namespace Org.BouncyCastle.Crypto.Modes
                 mBuf[mBufPos] = input;
                 if (++mBufPos == BufSize)
                 {
-                    ProcessData(mBuf, 0, BufSize, outBytes, outOff);
+                    ProcessBlock(mBuf, 0, outBytes, outOff);
                     mPoly1305.BlockUpdate(outBytes, outOff, BufSize);
                     this.mBufPos = 0;
                     return BufSize;
@@ -275,53 +275,99 @@ namespace Org.BouncyCastle.Crypto.Modes
             {
             case State.DecData:
             {
-                for (int i = 0; i < len; ++i)
+                int available = mBuf.Length - mBufPos;
+                if (len < available)
                 {
-                    mBuf[mBufPos] = inBytes[inOff + i];
-                    if (++mBufPos == mBuf.Length)
+                    Array.Copy(inBytes, inOff, mBuf, mBufPos, len);
+                    mBufPos += len;
+                    break;
+                }
+
+                if (mBufPos >= BufSize)
+                {
+                    mPoly1305.BlockUpdate(mBuf, 0, BufSize);
+                    ProcessBlock(mBuf, 0, outBytes, outOff);
+                    Array.Copy(mBuf, BufSize, mBuf, 0, mBufPos -= BufSize);
+                    resultLen = BufSize;
+
+                    available += BufSize;
+                    if (len < available)
                     {
-                        mPoly1305.BlockUpdate(mBuf, 0, BufSize);
-                        ProcessData(mBuf, 0, BufSize, outBytes, outOff + resultLen);
-                        Array.Copy(mBuf, BufSize, mBuf, 0, MacSize);
-                        this.mBufPos = MacSize;
-                        resultLen += BufSize;
+                        Array.Copy(inBytes, inOff, mBuf, mBufPos, len);
+                        mBufPos += len;
+                        break;
                     }
                 }
+
+                int inLimit1 = inOff + len - mBuf.Length;
+                int inLimit2 = inLimit1 - BufSize;
+
+                available = BufSize - mBufPos;
+                Array.Copy(inBytes, inOff, mBuf, mBufPos, available);
+                mPoly1305.BlockUpdate(mBuf, 0, BufSize);
+                ProcessBlock(mBuf, 0, outBytes, outOff + resultLen);
+                inOff += available;
+                resultLen += BufSize;
+
+                while (inOff <= inLimit2)
+                {
+                    mPoly1305.BlockUpdate(inBytes, inOff, BufSize * 2);
+                    ProcessBlocks2(inBytes, inOff, outBytes, outOff + resultLen);
+                    inOff += BufSize * 2;
+                    resultLen += BufSize * 2;
+                }
+
+                if (inOff <= inLimit1)
+                {
+                    mPoly1305.BlockUpdate(inBytes, inOff, BufSize);
+                    ProcessBlock(inBytes, inOff, outBytes, outOff + resultLen);
+                    inOff += BufSize;
+                    resultLen += BufSize;
+                }
+
+                mBufPos = mBuf.Length + inLimit1 - inOff;
+                Array.Copy(inBytes, inOff, mBuf, 0, mBufPos);
                 break;
             }
             case State.EncData:
             {
-                if (mBufPos != 0)
+                int available = BufSize - mBufPos;
+                if (len < available)
                 {
-                    while (len > 0)
-                    {
-                        --len;
-                        mBuf[mBufPos] = inBytes[inOff++];
-                        if (++mBufPos == BufSize)
-                        {
-                            ProcessData(mBuf, 0, BufSize, outBytes, outOff);
-                            mPoly1305.BlockUpdate(outBytes, outOff, BufSize);
-                            this.mBufPos = 0;
-                            resultLen = BufSize;
-                            break;
-                        }
-                    }
+                    Array.Copy(inBytes, inOff, mBuf, mBufPos, len);
+                    mBufPos += len;
+                    break;
                 }
 
-                while (len >= BufSize)
+                int inLimit1 = inOff + len - BufSize;
+                int inLimit2 = inLimit1 - BufSize;
+
+                if (mBufPos > 0)
                 {
-                    ProcessData(inBytes, inOff, BufSize, outBytes, outOff + resultLen);
-                    mPoly1305.BlockUpdate(outBytes, outOff + resultLen, BufSize);
-                    inOff += BufSize;
-                    len -= BufSize;
-                    resultLen += BufSize;
+                    Array.Copy(inBytes, inOff, mBuf, mBufPos, available);
+                    ProcessBlock(mBuf, 0, outBytes, outOff);
+                    inOff += available;
+                    resultLen = BufSize;
+                }
+
+                while (inOff <= inLimit2)
+                {
+                    ProcessBlocks2(inBytes, inOff, outBytes, outOff + resultLen);
+                    inOff += BufSize * 2;
+                    resultLen += BufSize * 2;
                 }
 
-                if (len > 0)
+                if (inOff <= inLimit1)
                 {
-                    Array.Copy(inBytes, inOff, mBuf, 0, len);
-                    this.mBufPos = len;
+                    ProcessBlock(inBytes, inOff, outBytes, outOff + resultLen);
+                    inOff += BufSize;
+                    resultLen += BufSize;
                 }
+
+                mPoly1305.BlockUpdate(outBytes, outOff, resultLen);
+
+                mBufPos = BufSize + inLimit1 - inOff;
+                Array.Copy(inBytes, inOff, mBuf, 0, mBufPos);
                 break;
             }
             default:
@@ -500,6 +546,24 @@ namespace Org.BouncyCastle.Crypto.Modes
             }
         }
 
+        private void ProcessBlock(byte[] inBytes, int inOff, byte[] outBytes, int outOff)
+        {
+            Check.OutputLength(outBytes, outOff, 64, "output buffer too short");
+
+            mChacha20.ProcessBlock(inBytes, inOff, outBytes, outOff);
+
+            this.mDataCount = IncrementCount(mDataCount, 64U, DataLimit);
+        }
+
+        private void ProcessBlocks2(byte[] inBytes, int inOff, byte[] outBytes, int outOff)
+        {
+            Check.OutputLength(outBytes, outOff, 128, "output buffer too short");
+
+            mChacha20.ProcessBlocks2(inBytes, inOff, outBytes, outOff);
+
+            this.mDataCount = IncrementCount(mDataCount, 128U, DataLimit);
+        }
+
         private void ProcessData(byte[] inBytes, int inOff, int inLen, byte[] outBytes, int outOff)
         {
             Check.OutputLength(outBytes, outOff, inLen, "output buffer too short");
diff --git a/crypto/src/tls/crypto/impl/bc/BcChaCha20Poly1305.cs b/crypto/src/tls/crypto/impl/bc/BcChaCha20Poly1305.cs
index 8d801ed7a..ab78d0ce2 100644
--- a/crypto/src/tls/crypto/impl/bc/BcChaCha20Poly1305.cs
+++ b/crypto/src/tls/crypto/impl/bc/BcChaCha20Poly1305.cs
@@ -30,7 +30,7 @@ namespace Org.BouncyCastle.Tls.Crypto.Impl.BC
             {
                 int ciphertextLength = inputLength;
 
-                m_cipher.ProcessBytes(input, inputOffset, inputLength, output, outputOffset);
+                m_cipher.DoFinal(input, inputOffset, inputLength, output, outputOffset);
                 int outputLength = inputLength;
 
                 if (ciphertextLength != outputLength)
@@ -63,7 +63,7 @@ namespace Org.BouncyCastle.Tls.Crypto.Impl.BC
                 if (badMac)
                     throw new TlsFatalAlert(AlertDescription.bad_record_mac);
 
-                m_cipher.ProcessBytes(input, inputOffset, ciphertextLength, output, outputOffset);
+                m_cipher.DoFinal(input, inputOffset, ciphertextLength, output, outputOffset);
                 int outputLength = ciphertextLength;
 
                 if (ciphertextLength != outputLength)