diff --git a/crypto/src/crypto/engines/ChaCha7539Engine.cs b/crypto/src/crypto/engines/ChaCha7539Engine.cs
index 206416a98..81e97478b 100644
--- a/crypto/src/crypto/engines/ChaCha7539Engine.cs
+++ b/crypto/src/crypto/engines/ChaCha7539Engine.cs
@@ -1,4 +1,12 @@
using System;
+using System.Diagnostics;
+#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER
+using System.Runtime.CompilerServices;
+#endif
+#if NETCOREAPP3_0_OR_GREATER
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
using Org.BouncyCastle.Crypto.Utilities;
@@ -58,9 +66,398 @@ namespace Org.BouncyCastle.Crypto.Engines
protected override void GenerateKeyStream(byte[] output)
{
- ChaChaEngine.ChachaCore(rounds, engineState, x);
- Pack.UInt32_To_LE(x, output, 0);
+ // ChachaCore now writes the block into the byte buffer itself, replacing the uint[] scratch + Pack step.
+ ChaChaEngine.ChachaCore(rounds, engineState, output);
}
- }
-}
+ /// <summary>
+ /// Processes the last <paramref name="inLen"/> bytes of a message in one call, then sets the block
+ /// counter word (engineState[12]) back to zero.
+ /// </summary>
+ /// <remarks>
+ /// Input is consumed as 128-byte pairs, then at most one whole 64-byte block, then a partial tail.
+ /// NOTE(review): only the 128-byte path goes through LimitExceeded (inside ProcessBlocks2); the
+ /// single-block and tail paths below do not - confirm the caller enforces the per-IV limit.
+ /// </remarks>
+ /// <param name="inBuf">Source buffer.</param>
+ /// <param name="inOff">Offset of the first byte to process in <paramref name="inBuf"/>.</param>
+ /// <param name="inLen">Number of bytes to process.</param>
+ /// <param name="outBuf">Destination buffer.</param>
+ /// <param name="outOff">Offset at which output is written in <paramref name="outBuf"/>.</param>
+ /// <exception cref="InvalidOperationException">The engine is not initialised, or index is non-zero.</exception>
+ internal void DoFinal(byte[] inBuf, int inOff, int inLen, byte[] outBuf, int outOff)
+ {
+ if (!initialised)
+ throw new InvalidOperationException(AlgorithmName + " not initialised");
+ if (index != 0)
+ throw new InvalidOperationException(AlgorithmName + " not in block-aligned state");
+
+ // Validate both buffers for the full length up front, before any block is processed.
+ Check.DataLength(inBuf, inOff, inLen, "input buffer too short");
+ Check.OutputLength(outBuf, outOff, inLen, "output buffer too short");
+
+ // Bulk path: two blocks (128 bytes) per iteration.
+ while (inLen >= 128)
+ {
+ ProcessBlocks2(inBuf, inOff, outBuf, outOff);
+ inOff += 128;
+ inLen -= 128;
+ outOff += 128;
+ }
+
+ // After the loop fewer than 128 bytes remain, so at most one whole block is left.
+ if (inLen >= 64)
+ {
+ ImplProcessBlock(inBuf, inOff, outBuf, outOff);
+ inOff += 64;
+ inLen -= 64;
+ outOff += 64;
+ }
+
+ // Partial tail: generate one more keystream block and XOR only its first inLen bytes.
+ if (inLen > 0)
+ {
+ GenerateKeyStream(keyStream);
+ AdvanceCounter();
+
+ for (int i = 0; i < inLen; ++i)
+ {
+ outBuf[outOff + i] = (byte)(inBuf[i + inOff] ^ keyStream[i]);
+ }
+ }
+
+ // Rewind the block counter word once the message is complete.
+ engineState[12] = 0;
+
+ // TODO Prevent re-use if encrypting
+ }
+
+ /// <summary>
+ /// XORs one 64-byte block of <paramref name="inBytes"/> with the next keystream block and writes the
+ /// result to <paramref name="outBytes"/>, after checking initialisation and the per-IV byte limit.
+ /// </summary>
+ /// <exception cref="InvalidOperationException">The engine is not initialised.</exception>
+ /// <exception cref="MaxBytesExceededException">LimitExceeded(64) reports the limit would be passed.</exception>
+ internal void ProcessBlock(byte[] inBytes, int inOff, byte[] outBytes, int outOff)
+ {
+ if (!initialised)
+ throw new InvalidOperationException(AlgorithmName + " not initialised");
+ if (LimitExceeded(64U))
+ throw new MaxBytesExceededException("2^38 byte limit per IV would be exceeded; Change IV");
+
+ // Block alignment is only asserted (debug builds); it is not enforced at runtime here.
+ Debug.Assert(index == 0);
+
+ ImplProcessBlock(inBytes, inOff, outBytes, outOff);
+ }
+
+ /// <summary>
+ /// XORs two consecutive 64-byte blocks (128 bytes) of <paramref name="inBytes"/> with the next two
+ /// keystream blocks and writes the result to <paramref name="outBytes"/>. Uses the AVX2 kernel when
+ /// available, otherwise the SSE2 kernel, otherwise two scalar single-block calls.
+ /// </summary>
+ /// <param name="inBytes">Source buffer; must hold 128 bytes from <paramref name="inOff"/>.</param>
+ /// <param name="inOff">Offset of the first input byte.</param>
+ /// <param name="outBytes">Destination buffer; must have room for 128 bytes from <paramref name="outOff"/>.</param>
+ /// <param name="outOff">Offset at which output is written.</param>
+ /// <exception cref="InvalidOperationException">The engine is not initialised.</exception>
+ /// <exception cref="MaxBytesExceededException">LimitExceeded(128) reports the limit would be passed.</exception>
+ internal void ProcessBlocks2(byte[] inBytes, int inOff, byte[] outBytes, int outOff)
+ {
+ if (!initialised)
+ throw new InvalidOperationException(AlgorithmName + " not initialised");
+
+ // Validate both 128-byte windows before touching the limit counter or the cipher state. The SIMD
+ // kernels load/store through Unsafe.ReadUnaligned/WriteUnaligned, which perform no bounds checks,
+ // and their length preconditions are Debug.Assert only.
+ Check.DataLength(inBytes, inOff, 128, "input buffer too short");
+ Check.OutputLength(outBytes, outOff, 128, "output buffer too short");
+
+ if (LimitExceeded(128U))
+ throw new MaxBytesExceededException("2^38 byte limit per IV would be exceeded; Change IV");
+
+ Debug.Assert(index == 0);
+
+#if NETCOREAPP3_0_OR_GREATER
+ // Fixed-length slices make AsSpan itself reject any window that does not fit inside the array
+ // (including negative offsets), so the kernels can never be handed fewer than 128 bytes.
+ if (Avx2.IsSupported)
+ {
+ ImplProcessBlocks2_X86_Avx2(rounds, engineState, inBytes.AsSpan(inOff, 128), outBytes.AsSpan(outOff, 128));
+ return;
+ }
+
+ if (Sse2.IsSupported)
+ {
+ ImplProcessBlocks2_X86_Sse2(rounds, engineState, inBytes.AsSpan(inOff, 128), outBytes.AsSpan(outOff, 128));
+ return;
+ }
+#endif
+
+ // Portable fallback: two independent single-block passes.
+ {
+ ImplProcessBlock(inBytes, inOff, outBytes, outOff);
+ ImplProcessBlock(inBytes, inOff + 64, outBytes, outOff + 64);
+ }
+ }
+
+ /// <summary>
+ /// Generates one keystream block into keyStream, advances the block counter, and XORs 64 bytes of
+ /// <paramref name="inBuf"/> with it into <paramref name="outBuf"/>. Performs no initialisation or
+ /// limit checks of its own; callers are expected to have done them.
+ /// </summary>
+#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+#endif
+ internal void ImplProcessBlock(byte[] inBuf, int inOff, byte[] outBuf, int outOff)
+ {
+ ChaChaEngine.ChachaCore(rounds, engineState, keyStream);
+ AdvanceCounter();
+
+ for (int i = 0; i < 64; ++i)
+ {
+ outBuf[outOff + i] = (byte)(keyStream[i] ^ inBuf[inOff + i]);
+ }
+ }
+
+#if NETCOREAPP3_0_OR_GREATER
+ /// <summary>
+ /// AVX2 kernel: computes two ChaCha blocks at once (one per 128-bit lane of each 256-bit register),
+ /// XORs them with the first 128 bytes of <paramref name="input"/> and stores the result to
+ /// <paramref name="output"/>. Increments state[12] twice as a side effect.
+ /// </summary>
+ /// <remarks>
+ /// Length and round-count preconditions are Debug.Assert only, and the load/store helpers do not
+ /// bounds-check beyond the first element, so callers must pass spans of at least 128 bytes.
+ /// </remarks>
+ /// <exception cref="PlatformNotSupportedException">AVX2 is not supported on this CPU.</exception>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal static void ImplProcessBlocks2_X86_Avx2(int rounds, uint[] state, Span<byte> input, Span<byte> output)
+ {
+ if (!Avx2.IsSupported)
+ throw new PlatformNotSupportedException();
+
+ Debug.Assert(rounds % 2 == 0);
+ Debug.Assert(state.Length >= 16);
+ Debug.Assert(input.Length >= 128);
+ Debug.Assert(output.Length >= 128);
+
+ // Rows 0..2 are shared by both blocks; row 3 is loaded twice, once per counter value.
+ var t0 = Load128_UInt32(state.AsSpan());
+ var t1 = Load128_UInt32(state.AsSpan(4));
+ var t2 = Load128_UInt32(state.AsSpan(8));
+ var t3 = Load128_UInt32(state.AsSpan(12));
+ ++state[12];
+ var t4 = Load128_UInt32(state.AsSpan(12));
+ ++state[12];
+
+ // Lower lane = first block (t3), upper lane = second block (t4, counter + 1).
+ var x0 = Vector256.Create(t0, t0);
+ var x1 = Vector256.Create(t1, t1);
+ var x2 = Vector256.Create(t2, t2);
+ var x3 = Vector256.Create(t3, t4);
+
+ var v0 = x0;
+ var v1 = x1;
+ var v2 = x2;
+ var v3 = x3;
+
+ // Each iteration is one double round. Rotations are built as (x << n) ^ (x >> (32 - n)).
+ for (int i = rounds; i > 0; i -= 2)
+ {
+ // Column round: quarter-round on all four columns in parallel (rotations 16, 12, 8, 7).
+ v0 = Avx2.Add(v0, v1);
+ v3 = Avx2.Xor(v3, v0);
+ v3 = Avx2.Xor(Avx2.ShiftLeftLogical(v3, 16), Avx2.ShiftRightLogical(v3, 16));
+ v2 = Avx2.Add(v2, v3);
+ v1 = Avx2.Xor(v1, v2);
+ v1 = Avx2.Xor(Avx2.ShiftLeftLogical(v1, 12), Avx2.ShiftRightLogical(v1, 20));
+ v0 = Avx2.Add(v0, v1);
+ v3 = Avx2.Xor(v3, v0);
+ v3 = Avx2.Xor(Avx2.ShiftLeftLogical(v3, 8), Avx2.ShiftRightLogical(v3, 24));
+ v2 = Avx2.Add(v2, v3);
+ v1 = Avx2.Xor(v1, v2);
+ v1 = Avx2.Xor(Avx2.ShiftLeftLogical(v1, 7), Avx2.ShiftRightLogical(v1, 25));
+
+ // Rotate rows 1..3 within each lane so the diagonals line up as columns.
+ v1 = Avx2.Shuffle(v1, 0x39);
+ v2 = Avx2.Shuffle(v2, 0x4E);
+ v3 = Avx2.Shuffle(v3, 0x93);
+
+ // Diagonal round: same quarter-round on the realigned rows.
+ v0 = Avx2.Add(v0, v1);
+ v3 = Avx2.Xor(v3, v0);
+ v3 = Avx2.Xor(Avx2.ShiftLeftLogical(v3, 16), Avx2.ShiftRightLogical(v3, 16));
+ v2 = Avx2.Add(v2, v3);
+ v1 = Avx2.Xor(v1, v2);
+ v1 = Avx2.Xor(Avx2.ShiftLeftLogical(v1, 12), Avx2.ShiftRightLogical(v1, 20));
+ v0 = Avx2.Add(v0, v1);
+ v3 = Avx2.Xor(v3, v0);
+ v3 = Avx2.Xor(Avx2.ShiftLeftLogical(v3, 8), Avx2.ShiftRightLogical(v3, 24));
+ v2 = Avx2.Add(v2, v3);
+ v1 = Avx2.Xor(v1, v2);
+ v1 = Avx2.Xor(Avx2.ShiftLeftLogical(v1, 7), Avx2.ShiftRightLogical(v1, 25));
+
+ // Undo the row rotation (inverse shuffles of the ones above).
+ v1 = Avx2.Shuffle(v1, 0x93);
+ v2 = Avx2.Shuffle(v2, 0x4E);
+ v3 = Avx2.Shuffle(v3, 0x39);
+ }
+
+ // Feed-forward: add the original input rows.
+ v0 = Avx2.Add(v0, x0);
+ v1 = Avx2.Add(v1, x1);
+ v2 = Avx2.Add(v2, x2);
+ v3 = Avx2.Add(v3, x3);
+
+ // Regroup lanes into byte order: 0x20 picks the lower lanes (first block), 0x31 the upper (second block).
+ var n0 = Avx2.Permute2x128(v0, v1, 0x20).AsByte();
+ var n1 = Avx2.Permute2x128(v2, v3, 0x20).AsByte();
+ var n2 = Avx2.Permute2x128(v0, v1, 0x31).AsByte();
+ var n3 = Avx2.Permute2x128(v2, v3, 0x31).AsByte();
+
+ // XOR keystream with the input, 32 bytes at a time.
+ n0 = Avx2.Xor(n0, Load256_Byte(input));
+ n1 = Avx2.Xor(n1, Load256_Byte(input.Slice(0x20)));
+ n2 = Avx2.Xor(n2, Load256_Byte(input.Slice(0x40)));
+ n3 = Avx2.Xor(n3, Load256_Byte(input.Slice(0x60)));
+
+ Store256_Byte(ref n0, output);
+ Store256_Byte(ref n1, output.Slice(0x20));
+ Store256_Byte(ref n2, output.Slice(0x40));
+ Store256_Byte(ref n3, output.Slice(0x60));
+ }
+
+ /// <summary>
+ /// SSE2 kernel: computes two ChaCha blocks one after the other using 128-bit registers, XORs them
+ /// with the first 128 bytes of <paramref name="input"/> and stores the result to
+ /// <paramref name="output"/>. Increments state[12] once per block as a side effect.
+ /// </summary>
+ /// <remarks>
+ /// Length and round-count preconditions are Debug.Assert only, and the load/store helpers do not
+ /// bounds-check beyond the first element, so callers must pass spans of at least 128 bytes.
+ /// </remarks>
+ /// <exception cref="PlatformNotSupportedException">SSE2 is not supported on this CPU.</exception>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal static void ImplProcessBlocks2_X86_Sse2(int rounds, uint[] state, Span<byte> input, Span<byte> output)
+ {
+ if (!Sse2.IsSupported)
+ throw new PlatformNotSupportedException();
+
+ Debug.Assert(rounds % 2 == 0);
+ Debug.Assert(state.Length >= 16);
+ Debug.Assert(input.Length >= 128);
+ Debug.Assert(output.Length >= 128);
+
+ // ---- First block: row 3 is read before the counter word is bumped. ----
+ var x0 = Load128_UInt32(state.AsSpan());
+ var x1 = Load128_UInt32(state.AsSpan(4));
+ var x2 = Load128_UInt32(state.AsSpan(8));
+ var x3 = Load128_UInt32(state.AsSpan(12));
+ ++state[12];
+
+ var v0 = x0;
+ var v1 = x1;
+ var v2 = x2;
+ var v3 = x3;
+
+ // Each iteration is one double round. Rotations are built as (x << n) ^ (x >> (32 - n)).
+ for (int i = rounds; i > 0; i -= 2)
+ {
+ // Column round (rotations 16, 12, 8, 7).
+ v0 = Sse2.Add(v0, v1);
+ v3 = Sse2.Xor(v3, v0);
+ v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16));
+ v2 = Sse2.Add(v2, v3);
+ v1 = Sse2.Xor(v1, v2);
+ v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20));
+ v0 = Sse2.Add(v0, v1);
+ v3 = Sse2.Xor(v3, v0);
+ v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24));
+ v2 = Sse2.Add(v2, v3);
+ v1 = Sse2.Xor(v1, v2);
+ v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25));
+
+ // Rotate rows 1..3 so the diagonals line up as columns.
+ v1 = Sse2.Shuffle(v1, 0x39);
+ v2 = Sse2.Shuffle(v2, 0x4E);
+ v3 = Sse2.Shuffle(v3, 0x93);
+
+ // Diagonal round.
+ v0 = Sse2.Add(v0, v1);
+ v3 = Sse2.Xor(v3, v0);
+ v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16));
+ v2 = Sse2.Add(v2, v3);
+ v1 = Sse2.Xor(v1, v2);
+ v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20));
+ v0 = Sse2.Add(v0, v1);
+ v3 = Sse2.Xor(v3, v0);
+ v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24));
+ v2 = Sse2.Add(v2, v3);
+ v1 = Sse2.Xor(v1, v2);
+ v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25));
+
+ // Undo the row rotation.
+ v1 = Sse2.Shuffle(v1, 0x93);
+ v2 = Sse2.Shuffle(v2, 0x4E);
+ v3 = Sse2.Shuffle(v3, 0x39);
+ }
+
+ // Feed-forward: add the original input rows.
+ v0 = Sse2.Add(v0, x0);
+ v1 = Sse2.Add(v1, x1);
+ v2 = Sse2.Add(v2, x2);
+ v3 = Sse2.Add(v3, x3);
+
+ // XOR the first keystream block with input bytes 0x00..0x3F.
+ var n0 = Load128_Byte(input);
+ var n1 = Load128_Byte(input.Slice(0x10));
+ var n2 = Load128_Byte(input.Slice(0x20));
+ var n3 = Load128_Byte(input.Slice(0x30));
+
+ n0 = Sse2.Xor(n0, v0.AsByte());
+ n1 = Sse2.Xor(n1, v1.AsByte());
+ n2 = Sse2.Xor(n2, v2.AsByte());
+ n3 = Sse2.Xor(n3, v3.AsByte());
+
+ Store128_Byte(ref n0, output);
+ Store128_Byte(ref n1, output.Slice(0x10));
+ Store128_Byte(ref n2, output.Slice(0x20));
+ Store128_Byte(ref n3, output.Slice(0x30));
+
+ // ---- Second block: only row 3 changes (it now carries the incremented counter). ----
+ x3 = Load128_UInt32(state.AsSpan(12));
+ ++state[12];
+
+ v0 = x0;
+ v1 = x1;
+ v2 = x2;
+ v3 = x3;
+
+ for (int i = rounds; i > 0; i -= 2)
+ {
+ // Column round.
+ v0 = Sse2.Add(v0, v1);
+ v3 = Sse2.Xor(v3, v0);
+ v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16));
+ v2 = Sse2.Add(v2, v3);
+ v1 = Sse2.Xor(v1, v2);
+ v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20));
+ v0 = Sse2.Add(v0, v1);
+ v3 = Sse2.Xor(v3, v0);
+ v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24));
+ v2 = Sse2.Add(v2, v3);
+ v1 = Sse2.Xor(v1, v2);
+ v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25));
+
+ v1 = Sse2.Shuffle(v1, 0x39);
+ v2 = Sse2.Shuffle(v2, 0x4E);
+ v3 = Sse2.Shuffle(v3, 0x93);
+
+ // Diagonal round.
+ v0 = Sse2.Add(v0, v1);
+ v3 = Sse2.Xor(v3, v0);
+ v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16));
+ v2 = Sse2.Add(v2, v3);
+ v1 = Sse2.Xor(v1, v2);
+ v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20));
+ v0 = Sse2.Add(v0, v1);
+ v3 = Sse2.Xor(v3, v0);
+ v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24));
+ v2 = Sse2.Add(v2, v3);
+ v1 = Sse2.Xor(v1, v2);
+ v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25));
+
+ v1 = Sse2.Shuffle(v1, 0x93);
+ v2 = Sse2.Shuffle(v2, 0x4E);
+ v3 = Sse2.Shuffle(v3, 0x39);
+ }
+
+ v0 = Sse2.Add(v0, x0);
+ v1 = Sse2.Add(v1, x1);
+ v2 = Sse2.Add(v2, x2);
+ v3 = Sse2.Add(v3, x3);
+
+ // XOR the second keystream block with input bytes 0x40..0x7F.
+ n0 = Load128_Byte(input.Slice(0x40));
+ n1 = Load128_Byte(input.Slice(0x50));
+ n2 = Load128_Byte(input.Slice(0x60));
+ n3 = Load128_Byte(input.Slice(0x70));
+
+ n0 = Sse2.Xor(n0, v0.AsByte());
+ n1 = Sse2.Xor(n1, v1.AsByte());
+ n2 = Sse2.Xor(n2, v2.AsByte());
+ n3 = Sse2.Xor(n3, v3.AsByte());
+
+ Store128_Byte(ref n0, output.Slice(0x40));
+ Store128_Byte(ref n1, output.Slice(0x50));
+ Store128_Byte(ref n2, output.Slice(0x60));
+ Store128_Byte(ref n3, output.Slice(0x70));
+ }
+
+ /// <summary>
+ /// Loads the first 16 bytes of <paramref name="t"/> into a Vector128. The fast path is a single
+ /// unaligned read that does not verify <paramref name="t"/> holds 16 bytes (only t[0] is indexed);
+ /// the fallback builds the vector element by element.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static Vector128<byte> Load128_Byte(Span<byte> t)
+ {
+ if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<byte>>() == 16)
+ return Unsafe.ReadUnaligned<Vector128<byte>>(ref t[0]);
+
+ return Vector128.Create(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], t[8], t[9], t[10], t[11], t[12],
+ t[13], t[14], t[15]);
+ }
+
+ /// <summary>
+ /// Loads the first four uints of <paramref name="t"/> into a Vector128. The fast path reinterprets
+ /// t[0] as bytes and does one unaligned read without verifying that four elements are present;
+ /// the fallback builds the vector from t[0]..t[3].
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static Vector128<uint> Load128_UInt32(Span<uint> t)
+ {
+ if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<uint>>() == 16)
+ return Unsafe.ReadUnaligned<Vector128<uint>>(ref Unsafe.As<uint, byte>(ref t[0]));
+
+ return Vector128.Create(t[0], t[1], t[2], t[3]);
+ }
+
+ /// <summary>
+ /// Loads the first 32 bytes of <paramref name="t"/> into a Vector256. The fast path is a single
+ /// unaligned read that does not verify <paramref name="t"/> holds 32 bytes (only t[0] is indexed);
+ /// the fallback builds the vector element by element.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static Vector256<byte> Load256_Byte(Span<byte> t)
+ {
+ if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector256<byte>>() == 32)
+ return Unsafe.ReadUnaligned<Vector256<byte>>(ref t[0]);
+
+ return Vector256.Create(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], t[8], t[9], t[10], t[11], t[12],
+ t[13], t[14], t[15], t[16], t[17], t[18], t[19], t[20], t[21], t[22], t[23], t[24], t[25], t[26], t[27],
+ t[28], t[29], t[30], t[31]);
+ }
+
+ /// <summary>
+ /// Writes the 16 bytes of <paramref name="s"/> to the start of <paramref name="t"/>. The fast path
+ /// is a single unaligned write that does not verify <paramref name="t"/> has room for 16 bytes;
+ /// the fallback writes the two 64-bit halves in little-endian order via Pack.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void Store128_Byte(ref Vector128<byte> s, Span<byte> t)
+ {
+ if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<byte>>() == 16)
+ {
+ Unsafe.WriteUnaligned(ref t[0], s);
+ return;
+ }
+
+ var u = s.AsUInt64();
+ Pack.UInt64_To_LE(u.GetElement(0), t);
+ Pack.UInt64_To_LE(u.GetElement(1), t.Slice(8));
+ }
+
+ /// <summary>
+ /// Writes the 32 bytes of <paramref name="s"/> to the start of <paramref name="t"/>. The fast path
+ /// is a single unaligned write that does not verify <paramref name="t"/> has room for 32 bytes;
+ /// the fallback writes the four 64-bit quarters in little-endian order via Pack.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void Store256_Byte(ref Vector256<byte> s, Span<byte> t)
+ {
+ if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector256<byte>>() == 32)
+ {
+ Unsafe.WriteUnaligned(ref t[0], s);
+ return;
+ }
+
+ var u = s.AsUInt64();
+ Pack.UInt64_To_LE(u.GetElement(0), t);
+ Pack.UInt64_To_LE(u.GetElement(1), t.Slice(8));
+ Pack.UInt64_To_LE(u.GetElement(2), t.Slice(16));
+ Pack.UInt64_To_LE(u.GetElement(3), t.Slice(24));
+ }
+#endif
+ }
+}
diff --git a/crypto/src/crypto/engines/ChaChaEngine.cs b/crypto/src/crypto/engines/ChaChaEngine.cs
index a97c04e08..a16491ba0 100644
--- a/crypto/src/crypto/engines/ChaChaEngine.cs
+++ b/crypto/src/crypto/engines/ChaChaEngine.cs
@@ -1,4 +1,10 @@
using System;
+using System.Diagnostics;
+#if NETCOREAPP3_0_OR_GREATER
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
using Org.BouncyCastle.Crypto.Utilities;
using Org.BouncyCastle.Utilities;
@@ -65,94 +71,169 @@ namespace Org.BouncyCastle.Crypto.Engines
protected override void GenerateKeyStream(byte[] output)
{
- ChachaCore(rounds, engineState, x);
- Pack.UInt32_To_LE(x, output, 0);
+ // ChachaCore now writes the block into the byte buffer itself, replacing the uint[] scratch + Pack step.
+ ChachaCore(rounds, engineState, output);
}
- /// <summary>
- /// ChaCha function.
- /// </summary>
- /// <param name="rounds">The number of ChaCha rounds to execute</param>
- /// <param name="input">The input words.</param>
- /// <param name="x">The ChaCha state to modify.</param>
- internal static void ChachaCore(int rounds, uint[] input, uint[] x)
+ /// <summary>
+ /// ChaCha block function: runs <paramref name="rounds"/> rounds over the 16 input words, adds the
+ /// input words back in, and writes the 64-byte result to <paramref name="output"/>.
+ /// </summary>
+ /// <remarks>
+ /// Preconditions (even round count, at least 16 input words, at least 64 output bytes) are
+ /// Debug.Assert only. Uses an SSE2 path when available, otherwise a scalar path that serialises
+ /// each word little-endian via Pack.
+ /// </remarks>
+ /// <param name="rounds">The number of ChaCha rounds to execute.</param>
+ /// <param name="input">The input words; not modified here.</param>
+ /// <param name="output">Receives the 64-byte block.</param>
+ internal static void ChachaCore(int rounds, uint[] input, byte[] output)
+ {
+ Debug.Assert(rounds % 2 == 0);
+ Debug.Assert(input.Length >= 16);
+ Debug.Assert(output.Length >= 64);
+
+#if NETCOREAPP3_0_OR_GREATER
+ if (Sse2.IsSupported)
+ {
+ // One 128-bit register per row of the 4x4 state.
+ var x0 = Load128_UInt32(input.AsSpan());
+ var x1 = Load128_UInt32(input.AsSpan(4));
+ var x2 = Load128_UInt32(input.AsSpan(8));
+ var x3 = Load128_UInt32(input.AsSpan(12));
+
+ var v0 = x0;
+ var v1 = x1;
+ var v2 = x2;
+ var v3 = x3;
+
+ // Each iteration is one double round. Rotations are built as (x << n) ^ (x >> (32 - n)).
+ for (int i = rounds; i > 0; i -= 2)
+ {
+ // Column round (rotations 16, 12, 8, 7).
+ v0 = Sse2.Add(v0, v1);
+ v3 = Sse2.Xor(v3, v0);
+ v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16));
+ v2 = Sse2.Add(v2, v3);
+ v1 = Sse2.Xor(v1, v2);
+ v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20));
+ v0 = Sse2.Add(v0, v1);
+ v3 = Sse2.Xor(v3, v0);
+ v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24));
+ v2 = Sse2.Add(v2, v3);
+ v1 = Sse2.Xor(v1, v2);
+ v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25));
+
+ // Rotate rows 1..3 so the diagonals line up as columns.
+ v1 = Sse2.Shuffle(v1, 0x39);
+ v2 = Sse2.Shuffle(v2, 0x4E);
+ v3 = Sse2.Shuffle(v3, 0x93);
+
+ // Diagonal round.
+ v0 = Sse2.Add(v0, v1);
+ v3 = Sse2.Xor(v3, v0);
+ v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16));
+ v2 = Sse2.Add(v2, v3);
+ v1 = Sse2.Xor(v1, v2);
+ v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20));
+ v0 = Sse2.Add(v0, v1);
+ v3 = Sse2.Xor(v3, v0);
+ v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24));
+ v2 = Sse2.Add(v2, v3);
+ v1 = Sse2.Xor(v1, v2);
+ v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25));
+
+ // Undo the row rotation.
+ v1 = Sse2.Shuffle(v1, 0x93);
+ v2 = Sse2.Shuffle(v2, 0x4E);
+ v3 = Sse2.Shuffle(v3, 0x39);
+ }
+
+ // Feed-forward: add the original input rows, then store 16 bytes per row.
+ v0 = Sse2.Add(v0, x0);
+ v1 = Sse2.Add(v1, x1);
+ v2 = Sse2.Add(v2, x2);
+ v3 = Sse2.Add(v3, x3);
+
+ Store128_UInt32(ref v0, output.AsSpan());
+ Store128_UInt32(ref v1, output.AsSpan(0x10));
+ Store128_UInt32(ref v2, output.AsSpan(0x20));
+ Store128_UInt32(ref v3, output.AsSpan(0x30));
+ return;
+ }
+#endif
+
+ // Scalar path: the sixteen state words live in locals for the duration of the rounds.
+ {
+ uint x00 = input[ 0], x01 = input[ 1], x02 = input[ 2], x03 = input[ 3];
+ uint x04 = input[ 4], x05 = input[ 5], x06 = input[ 6], x07 = input[ 7];
+ uint x08 = input[ 8], x09 = input[ 9], x10 = input[10], x11 = input[11];
+ uint x12 = input[12], x13 = input[13], x14 = input[14], x15 = input[15];
+
+ for (int i = rounds; i > 0; i -= 2)
+ {
+ // Column round: the four quarter-rounds are interleaved step by step
+ // (they operate on disjoint words, so the order between them does not matter).
+ x00 += x04; x12 = Integers.RotateLeft(x12 ^ x00, 16);
+ x01 += x05; x13 = Integers.RotateLeft(x13 ^ x01, 16);
+ x02 += x06; x14 = Integers.RotateLeft(x14 ^ x02, 16);
+ x03 += x07; x15 = Integers.RotateLeft(x15 ^ x03, 16);
+
+ x08 += x12; x04 = Integers.RotateLeft(x04 ^ x08, 12);
+ x09 += x13; x05 = Integers.RotateLeft(x05 ^ x09, 12);
+ x10 += x14; x06 = Integers.RotateLeft(x06 ^ x10, 12);
+ x11 += x15; x07 = Integers.RotateLeft(x07 ^ x11, 12);
+
+ x00 += x04; x12 = Integers.RotateLeft(x12 ^ x00, 8);
+ x01 += x05; x13 = Integers.RotateLeft(x13 ^ x01, 8);
+ x02 += x06; x14 = Integers.RotateLeft(x14 ^ x02, 8);
+ x03 += x07; x15 = Integers.RotateLeft(x15 ^ x03, 8);
+
+ x08 += x12; x04 = Integers.RotateLeft(x04 ^ x08, 7);
+ x09 += x13; x05 = Integers.RotateLeft(x05 ^ x09, 7);
+ x10 += x14; x06 = Integers.RotateLeft(x06 ^ x10, 7);
+ x11 += x15; x07 = Integers.RotateLeft(x07 ^ x11, 7);
+
+ // Diagonal round, interleaved the same way.
+ x00 += x05; x15 = Integers.RotateLeft(x15 ^ x00, 16);
+ x01 += x06; x12 = Integers.RotateLeft(x12 ^ x01, 16);
+ x02 += x07; x13 = Integers.RotateLeft(x13 ^ x02, 16);
+ x03 += x04; x14 = Integers.RotateLeft(x14 ^ x03, 16);
+
+ x10 += x15; x05 = Integers.RotateLeft(x05 ^ x10, 12);
+ x11 += x12; x06 = Integers.RotateLeft(x06 ^ x11, 12);
+ x08 += x13; x07 = Integers.RotateLeft(x07 ^ x08, 12);
+ x09 += x14; x04 = Integers.RotateLeft(x04 ^ x09, 12);
+
+ x00 += x05; x15 = Integers.RotateLeft(x15 ^ x00, 8);
+ x01 += x06; x12 = Integers.RotateLeft(x12 ^ x01, 8);
+ x02 += x07; x13 = Integers.RotateLeft(x13 ^ x02, 8);
+ x03 += x04; x14 = Integers.RotateLeft(x14 ^ x03, 8);
+
+ x10 += x15; x05 = Integers.RotateLeft(x05 ^ x10, 7);
+ x11 += x12; x06 = Integers.RotateLeft(x06 ^ x11, 7);
+ x08 += x13; x07 = Integers.RotateLeft(x07 ^ x08, 7);
+ x09 += x14; x04 = Integers.RotateLeft(x04 ^ x09, 7);
+ }
+
+ // Feed-forward and serialise each word at its 4-byte offset.
+ Pack.UInt32_To_LE(x00 + input[ 0], output, 0);
+ Pack.UInt32_To_LE(x01 + input[ 1], output, 4);
+ Pack.UInt32_To_LE(x02 + input[ 2], output, 8);
+ Pack.UInt32_To_LE(x03 + input[ 3], output, 12);
+ Pack.UInt32_To_LE(x04 + input[ 4], output, 16);
+ Pack.UInt32_To_LE(x05 + input[ 5], output, 20);
+ Pack.UInt32_To_LE(x06 + input[ 6], output, 24);
+ Pack.UInt32_To_LE(x07 + input[ 7], output, 28);
+ Pack.UInt32_To_LE(x08 + input[ 8], output, 32);
+ Pack.UInt32_To_LE(x09 + input[ 9], output, 36);
+ Pack.UInt32_To_LE(x10 + input[10], output, 40);
+ Pack.UInt32_To_LE(x11 + input[11], output, 44);
+ Pack.UInt32_To_LE(x12 + input[12], output, 48);
+ Pack.UInt32_To_LE(x13 + input[13], output, 52);
+ Pack.UInt32_To_LE(x14 + input[14], output, 56);
+ Pack.UInt32_To_LE(x15 + input[15], output, 60);
+ }
+ }
+
+#if NETCOREAPP3_0_OR_GREATER
+ /// <summary>
+ /// Loads the first four uints of <paramref name="t"/> into a Vector128. The fast path reinterprets
+ /// t[0] as bytes and does one unaligned read without verifying that four elements are present;
+ /// the fallback builds the vector from t[0]..t[3].
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static Vector128<uint> Load128_UInt32(Span<uint> t)
+ {
+ if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<uint>>() == 16)
+ return Unsafe.ReadUnaligned<Vector128<uint>>(ref Unsafe.As<uint, byte>(ref t[0]));
+
+ return Vector128.Create(t[0], t[1], t[2], t[3]);
+ }
+
+ /// <summary>
+ /// Writes the four uints of <paramref name="s"/> as 16 bytes to the start of <paramref name="t"/>.
+ /// The fast path is a single unaligned write that does not verify <paramref name="t"/> has room
+ /// for 16 bytes; the fallback writes the two 64-bit halves in little-endian order via Pack.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void Store128_UInt32(ref Vector128<uint> s, Span<byte> t)
{
- if (input.Length != 16)
- throw new ArgumentException();
- if (x.Length != 16)
- throw new ArgumentException();
- if (rounds % 2 != 0)
- throw new ArgumentException("Number of rounds must be even");
-
- uint x00 = input[ 0];
- uint x01 = input[ 1];
- uint x02 = input[ 2];
- uint x03 = input[ 3];
- uint x04 = input[ 4];
- uint x05 = input[ 5];
- uint x06 = input[ 6];
- uint x07 = input[ 7];
- uint x08 = input[ 8];
- uint x09 = input[ 9];
- uint x10 = input[10];
- uint x11 = input[11];
- uint x12 = input[12];
- uint x13 = input[13];
- uint x14 = input[14];
- uint x15 = input[15];
-
- for (int i = rounds; i > 0; i -= 2)
+ if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<uint>>() == 16)
{
- x00 += x04; x12 = Integers.RotateLeft(x12 ^ x00, 16);
- x08 += x12; x04 = Integers.RotateLeft(x04 ^ x08, 12);
- x00 += x04; x12 = Integers.RotateLeft(x12 ^ x00, 8);
- x08 += x12; x04 = Integers.RotateLeft(x04 ^ x08, 7);
- x01 += x05; x13 = Integers.RotateLeft(x13 ^ x01, 16);
- x09 += x13; x05 = Integers.RotateLeft(x05 ^ x09, 12);
- x01 += x05; x13 = Integers.RotateLeft(x13 ^ x01, 8);
- x09 += x13; x05 = Integers.RotateLeft(x05 ^ x09, 7);
- x02 += x06; x14 = Integers.RotateLeft(x14 ^ x02, 16);
- x10 += x14; x06 = Integers.RotateLeft(x06 ^ x10, 12);
- x02 += x06; x14 = Integers.RotateLeft(x14 ^ x02, 8);
- x10 += x14; x06 = Integers.RotateLeft(x06 ^ x10, 7);
- x03 += x07; x15 = Integers.RotateLeft(x15 ^ x03, 16);
- x11 += x15; x07 = Integers.RotateLeft(x07 ^ x11, 12);
- x03 += x07; x15 = Integers.RotateLeft(x15 ^ x03, 8);
- x11 += x15; x07 = Integers.RotateLeft(x07 ^ x11, 7);
- x00 += x05; x15 = Integers.RotateLeft(x15 ^ x00, 16);
- x10 += x15; x05 = Integers.RotateLeft(x05 ^ x10, 12);
- x00 += x05; x15 = Integers.RotateLeft(x15 ^ x00, 8);
- x10 += x15; x05 = Integers.RotateLeft(x05 ^ x10, 7);
- x01 += x06; x12 = Integers.RotateLeft(x12 ^ x01, 16);
- x11 += x12; x06 = Integers.RotateLeft(x06 ^ x11, 12);
- x01 += x06; x12 = Integers.RotateLeft(x12 ^ x01, 8);
- x11 += x12; x06 = Integers.RotateLeft(x06 ^ x11, 7);
- x02 += x07; x13 = Integers.RotateLeft(x13 ^ x02, 16);
- x08 += x13; x07 = Integers.RotateLeft(x07 ^ x08, 12);
- x02 += x07; x13 = Integers.RotateLeft(x13 ^ x02, 8);
- x08 += x13; x07 = Integers.RotateLeft(x07 ^ x08, 7);
- x03 += x04; x14 = Integers.RotateLeft(x14 ^ x03, 16);
- x09 += x14; x04 = Integers.RotateLeft(x04 ^ x09, 12);
- x03 += x04; x14 = Integers.RotateLeft(x14 ^ x03, 8);
- x09 += x14; x04 = Integers.RotateLeft(x04 ^ x09, 7);
+ Unsafe.WriteUnaligned(ref t[0], s);
+ return;
}
- x[ 0] = x00 + input[ 0];
- x[ 1] = x01 + input[ 1];
- x[ 2] = x02 + input[ 2];
- x[ 3] = x03 + input[ 3];
- x[ 4] = x04 + input[ 4];
- x[ 5] = x05 + input[ 5];
- x[ 6] = x06 + input[ 6];
- x[ 7] = x07 + input[ 7];
- x[ 8] = x08 + input[ 8];
- x[ 9] = x09 + input[ 9];
- x[10] = x10 + input[10];
- x[11] = x11 + input[11];
- x[12] = x12 + input[12];
- x[13] = x13 + input[13];
- x[14] = x14 + input[14];
- x[15] = x15 + input[15];
+ // Fallback: reinterpret as two 64-bit halves and serialise each little-endian.
+ var u = s.AsUInt64();
+ Pack.UInt64_To_LE(u.GetElement(0), t);
+ Pack.UInt64_To_LE(u.GetElement(1), t.Slice(8));
}
+#endif
}
}
diff --git a/crypto/src/crypto/engines/Salsa20Engine.cs b/crypto/src/crypto/engines/Salsa20Engine.cs
index a8170d173..77b08f9fc 100644
--- a/crypto/src/crypto/engines/Salsa20Engine.cs
+++ b/crypto/src/crypto/engines/Salsa20Engine.cs
@@ -35,11 +35,11 @@ namespace Org.BouncyCastle.Crypto.Engines
* variables to hold the state of the engine
* during encryption and decryption
*/
- private int index = 0;
+ internal int index = 0;
internal uint[] engineState = new uint[StateSize]; // state
internal uint[] x = new uint[StateSize]; // internal buffer
- private byte[] keyStream = new byte[StateSize * 4]; // expanded state, 64 bytes
- private bool initialised = false;
+ internal byte[] keyStream = new byte[StateSize * 4]; // expanded state, 64 bytes
+ internal bool initialised = false;
/*
* internal counter
@@ -302,14 +302,14 @@ namespace Org.BouncyCastle.Crypto.Engines
x[15] = x15 + input[15];
}
- private void ResetLimitCounter()
+ // Zeroes the three limit-counter words (cW0, cW1, cW2).
+ internal void ResetLimitCounter()
{
cW0 = 0;
cW1 = 0;
cW2 = 0;
}
- private bool LimitExceeded()
+ internal bool LimitExceeded()
{
if (++cW0 == 0)
{
@@ -325,7 +325,7 @@ namespace Org.BouncyCastle.Crypto.Engines
/*
* this relies on the fact len will always be positive.
*/
- private bool LimitExceeded(
+ internal bool LimitExceeded(
uint len)
{
uint old = cW0;
diff --git a/crypto/src/crypto/macs/Poly1305.cs b/crypto/src/crypto/macs/Poly1305.cs
index c0a660fac..595d9b051 100644
--- a/crypto/src/crypto/macs/Poly1305.cs
+++ b/crypto/src/crypto/macs/Poly1305.cs
@@ -1,6 +1,9 @@
using System;
+using System.Diagnostics;
+#if NETCOREAPP3_0_OR_GREATER
+using System.Runtime.CompilerServices;
+#endif
-using Org.BouncyCastle.Crypto.Generators;
using Org.BouncyCastle.Crypto.Parameters;
using Org.BouncyCastle.Crypto.Utilities;
@@ -27,8 +30,6 @@ namespace Org.BouncyCastle.Crypto.Macs
private readonly IBlockCipher cipher;
- private readonly byte[] singleByte = new byte[1];
-
// Initialised state
/** Polynomial key */
@@ -163,61 +164,79 @@ namespace Org.BouncyCastle.Crypto.Macs
public void Update(byte input)
{
- singleByte[0] = input;
- BlockUpdate(singleByte, 0, 1);
+ // Append straight into the pending block; once it holds BlockSize bytes, absorb it and start over.
+ currentBlock[currentBlockOffset++] = input;
+ if (currentBlockOffset == BlockSize)
+ {
+ ProcessBlock(currentBlock, 0);
+ currentBlockOffset = 0;
+ }
}
public void BlockUpdate(byte[] input, int inOff, int len)
{
- int copied = 0;
- while (len > copied)
+ // Reject ranges that run past the end of the array before any block is absorbed. Whole blocks are
+ // handed to ProcessBlock in place, and its little-endian path copies 16 bytes with
+ // Unsafe.CopyBlockUnaligned after range-checking only the first byte.
+ Check.DataLength(input, inOff, len, "input buffer too short");
+
+ // Fast exit: the new bytes do not complete the pending block, so just buffer them.
+ int available = BlockSize - currentBlockOffset;
+ if (len < available)
{
- if (currentBlockOffset == BlockSize)
- {
- ProcessBlock();
- currentBlockOffset = 0;
- }
+ Array.Copy(input, inOff, currentBlock, currentBlockOffset, len);
+ currentBlockOffset += len;
+ return;
+ }
- int toCopy = System.Math.Min((len - copied), BlockSize - currentBlockOffset);
- Array.Copy(input, copied + inOff, currentBlock, currentBlockOffset, toCopy);
- copied += toCopy;
- currentBlockOffset += toCopy;
+ // Top up and absorb a partially filled pending block first.
+ int pos = 0;
+ if (currentBlockOffset > 0)
+ {
+ Array.Copy(input, inOff, currentBlock, currentBlockOffset, available);
+ pos = available;
+ ProcessBlock(currentBlock, 0);
}
+ // Absorb whole blocks directly from the caller's array, without copying.
+ int remaining;
+ while ((remaining = len - pos) >= BlockSize)
+ {
+ ProcessBlock(input, inOff + pos);
+ pos += BlockSize;
+ }
+
+ // Keep the tail (fewer than BlockSize bytes, possibly none) as the new pending block.
+ Array.Copy(input, inOff + pos, currentBlock, 0, remaining);
+ currentBlockOffset = remaining;
}
- private void ProcessBlock()
+ private void ProcessBlock(byte[] buf, int off)
{
- if (currentBlockOffset < BlockSize)
+#if NETCOREAPP3_0_OR_GREATER
+ if (BitConverter.IsLittleEndian)
{
- currentBlock[currentBlockOffset] = 1;
- for (int i = currentBlockOffset + 1; i < BlockSize; i++)
- {
- currentBlock[i] = 0;
- }
+ Span<uint> t = stackalloc uint[4];
+ Unsafe.CopyBlockUnaligned(ref Unsafe.As<uint, byte>(ref t[0]), ref buf[off], 16);
+
+ h0 += t[0] & 0x3ffffffU;
+ h1 += ((t[1] << 6) | (t[0] >> 26)) & 0x3ffffffU;
+ h2 += ((t[2] << 12) | (t[1] >> 20)) & 0x3ffffffU;
+ h3 += ((t[3] << 18) | (t[2] >> 14)) & 0x3ffffffU;
+ h4 += (1 << 24) | (t[3] >> 8);
}
-
- ulong t0 = Pack.LE_To_UInt32(currentBlock, 0);
- ulong t1 = Pack.LE_To_UInt32(currentBlock, 4);
- ulong t2 = Pack.LE_To_UInt32(currentBlock, 8);
- ulong t3 = Pack.LE_To_UInt32(currentBlock, 12);
-
- h0 += (uint)(t0 & 0x3ffffffU);
- h1 += (uint)((((t1 << 32) | t0) >> 26) & 0x3ffffff);
- h2 += (uint)((((t2 << 32) | t1) >> 20) & 0x3ffffff);
- h3 += (uint)((((t3 << 32) | t2) >> 14) & 0x3ffffff);
- h4 += (uint)(t3 >> 8);
-
- if (currentBlockOffset == BlockSize)
+ else
+#endif
{
- h4 += (1 << 24);
+ uint t0 = Pack.LE_To_UInt32(buf, off + 0);
+ uint t1 = Pack.LE_To_UInt32(buf, off + 4);
+ uint t2 = Pack.LE_To_UInt32(buf, off + 8);
+ uint t3 = Pack.LE_To_UInt32(buf, off + 12);
+
+ h0 += t0 & 0x3ffffffU;
+ h1 += ((t1 << 6) | (t0 >> 26)) & 0x3ffffffU;
+ h2 += ((t2 << 12) | (t1 >> 20)) & 0x3ffffffU;
+ h3 += ((t3 << 18) | (t2 >> 14)) & 0x3ffffffU;
+ h4 += ( 1 << 24) | (t3 >> 8);
}
- ulong tp0 = mul32x32_64(h0,r0) + mul32x32_64(h1,s4) + mul32x32_64(h2,s3) + mul32x32_64(h3,s2) + mul32x32_64(h4,s1);
- ulong tp1 = mul32x32_64(h0,r1) + mul32x32_64(h1,r0) + mul32x32_64(h2,s4) + mul32x32_64(h3,s3) + mul32x32_64(h4,s2);
- ulong tp2 = mul32x32_64(h0,r2) + mul32x32_64(h1,r1) + mul32x32_64(h2,r0) + mul32x32_64(h3,s4) + mul32x32_64(h4,s3);
- ulong tp3 = mul32x32_64(h0,r3) + mul32x32_64(h1,r2) + mul32x32_64(h2,r1) + mul32x32_64(h3,r0) + mul32x32_64(h4,s4);
- ulong tp4 = mul32x32_64(h0,r4) + mul32x32_64(h1,r3) + mul32x32_64(h2,r2) + mul32x32_64(h3,r1) + mul32x32_64(h4,r0);
+ ulong tp0 = (ulong)h0 * r0 + (ulong)h1 * s4 + (ulong)h2 * s3 + (ulong)h3 * s2 + (ulong)h4 * s1;
+ ulong tp1 = (ulong)h0 * r1 + (ulong)h1 * r0 + (ulong)h2 * s4 + (ulong)h3 * s3 + (ulong)h4 * s2;
+ ulong tp2 = (ulong)h0 * r2 + (ulong)h1 * r1 + (ulong)h2 * r0 + (ulong)h3 * s4 + (ulong)h4 * s3;
+ ulong tp3 = (ulong)h0 * r3 + (ulong)h1 * r2 + (ulong)h2 * r1 + (ulong)h3 * r0 + (ulong)h4 * s4;
+ ulong tp4 = (ulong)h0 * r4 + (ulong)h1 * r3 + (ulong)h2 * r2 + (ulong)h3 * r1 + (ulong)h4 * r0;
h0 = (uint)tp0 & 0x3ffffff; tp1 += (tp0 >> 26);
h1 = (uint)tp1 & 0x3ffffff; tp2 += (tp1 >> 26);
@@ -225,7 +244,7 @@ namespace Org.BouncyCastle.Crypto.Macs
h3 = (uint)tp3 & 0x3ffffff; tp4 += (tp3 >> 26);
h4 = (uint)tp4 & 0x3ffffff;
h0 += (uint)(tp4 >> 26) * 5;
- h1 += (h0 >> 26); h0 &= 0x3ffffff;
+ h1 += h0 >> 26; h0 &= 0x3ffffff;
}
public int DoFinal(byte[] output, int outOff)
@@ -235,44 +254,38 @@ namespace Org.BouncyCastle.Crypto.Macs
if (currentBlockOffset > 0)
{
// Process padded block
- ProcessBlock();
+ if (currentBlockOffset < BlockSize)
+ {
+ currentBlock[currentBlockOffset++] = 1;
+ while (currentBlockOffset < BlockSize)
+ {
+ currentBlock[currentBlockOffset++] = 0;
+ }
+
+ h4 -= (1 << 24);
+ }
+
+ ProcessBlock(currentBlock, 0);
}
- h1 += (h0 >> 26); h0 &= 0x3ffffff;
- h2 += (h1 >> 26); h1 &= 0x3ffffff;
- h3 += (h2 >> 26); h2 &= 0x3ffffff;
- h4 += (h3 >> 26); h3 &= 0x3ffffff;
- h0 += (h4 >> 26) * 5; h4 &= 0x3ffffff;
- h1 += (h0 >> 26); h0 &= 0x3ffffff;
-
- uint g0, g1, g2, g3, g4, b;
- g0 = h0 + 5; b = g0 >> 26; g0 &= 0x3ffffff;
- g1 = h1 + b; b = g1 >> 26; g1 &= 0x3ffffff;
- g2 = h2 + b; b = g2 >> 26; g2 &= 0x3ffffff;
- g3 = h3 + b; b = g3 >> 26; g3 &= 0x3ffffff;
- g4 = h4 + b - (1 << 26);
-
- b = (g4 >> 31) - 1;
- uint nb = ~b;
- h0 = (h0 & nb) | (g0 & b);
- h1 = (h1 & nb) | (g1 & b);
- h2 = (h2 & nb) | (g2 & b);
- h3 = (h3 & nb) | (g3 & b);
- h4 = (h4 & nb) | (g4 & b);
-
- ulong f0, f1, f2, f3;
- f0 = ((h0 ) | (h1 << 26)) + (ulong)k0;
- f1 = ((h1 >> 6 ) | (h2 << 20)) + (ulong)k1;
- f2 = ((h2 >> 12) | (h3 << 14)) + (ulong)k2;
- f3 = ((h3 >> 18) | (h4 << 8 )) + (ulong)k3;
-
- Pack.UInt32_To_LE((uint)f0, output, outOff);
- f1 += (f0 >> 32);
- Pack.UInt32_To_LE((uint)f1, output, outOff + 4);
- f2 += (f1 >> 32);
- Pack.UInt32_To_LE((uint)f2, output, outOff + 8);
- f3 += (f2 >> 32);
- Pack.UInt32_To_LE((uint)f3, output, outOff + 12);
+ Debug.Assert(h4 >> 26 == 0);
+
+ //h0 += (h4 >> 26) * 5U + 5U; h4 &= 0x3ffffff;
+ h0 += 5U;
+ h1 += h0 >> 26; h0 &= 0x3ffffff;
+ h2 += h1 >> 26; h1 &= 0x3ffffff;
+ h3 += h2 >> 26; h2 &= 0x3ffffff;
+ h4 += h3 >> 26; h3 &= 0x3ffffff;
+
+ long c = ((int)(h4 >> 26) - 1) * 5;
+ c += (long)k0 + ((h0 ) | (h1 << 26));
+ Pack.UInt32_To_LE((uint)c, output, outOff ); c >>= 32;
+ c += (long)k1 + ((h1 >> 6) | (h2 << 20));
+ Pack.UInt32_To_LE((uint)c, output, outOff + 4); c >>= 32;
+ c += (long)k2 + ((h2 >> 12) | (h3 << 14));
+ Pack.UInt32_To_LE((uint)c, output, outOff + 8); c >>= 32;
+ c += (long)k3 + ((h3 >> 18) | (h4 << 8));
+ Pack.UInt32_To_LE((uint)c, output, outOff + 12);
Reset();
return BlockSize;
@@ -284,10 +297,5 @@ namespace Org.BouncyCastle.Crypto.Macs
h0 = h1 = h2 = h3 = h4 = 0;
}
-
- private static ulong mul32x32_64(uint i1, uint i2)
- {
- return ((ulong)i1) * i2;
- }
}
}
diff --git a/crypto/src/crypto/modes/ChaCha20Poly1305.cs b/crypto/src/crypto/modes/ChaCha20Poly1305.cs
index 6ca32d9c6..462013200 100644
--- a/crypto/src/crypto/modes/ChaCha20Poly1305.cs
+++ b/crypto/src/crypto/modes/ChaCha20Poly1305.cs
@@ -221,7 +221,7 @@ namespace Org.BouncyCastle.Crypto.Modes
if (++mBufPos == mBuf.Length)
{
mPoly1305.BlockUpdate(mBuf, 0, BufSize);
- ProcessData(mBuf, 0, BufSize, outBytes, outOff);
+ ProcessBlock(mBuf, 0, outBytes, outOff);
Array.Copy(mBuf, BufSize, mBuf, 0, MacSize);
this.mBufPos = MacSize;
return BufSize;
@@ -234,7 +234,7 @@ namespace Org.BouncyCastle.Crypto.Modes
mBuf[mBufPos] = input;
if (++mBufPos == BufSize)
{
- ProcessData(mBuf, 0, BufSize, outBytes, outOff);
+ ProcessBlock(mBuf, 0, outBytes, outOff);
mPoly1305.BlockUpdate(outBytes, outOff, BufSize);
this.mBufPos = 0;
return BufSize;
@@ -275,53 +275,99 @@ namespace Org.BouncyCastle.Crypto.Modes
{
case State.DecData:
{
- for (int i = 0; i < len; ++i)
+ int available = mBuf.Length - mBufPos;
+ if (len < available)
{
- mBuf[mBufPos] = inBytes[inOff + i];
- if (++mBufPos == mBuf.Length)
+ Array.Copy(inBytes, inOff, mBuf, mBufPos, len);
+ mBufPos += len;
+ break;
+ }
+
+ if (mBufPos >= BufSize)
+ {
+ mPoly1305.BlockUpdate(mBuf, 0, BufSize);
+ ProcessBlock(mBuf, 0, outBytes, outOff);
+ Array.Copy(mBuf, BufSize, mBuf, 0, mBufPos -= BufSize);
+ resultLen = BufSize;
+
+ available += BufSize;
+ if (len < available)
{
- mPoly1305.BlockUpdate(mBuf, 0, BufSize);
- ProcessData(mBuf, 0, BufSize, outBytes, outOff + resultLen);
- Array.Copy(mBuf, BufSize, mBuf, 0, MacSize);
- this.mBufPos = MacSize;
- resultLen += BufSize;
+ Array.Copy(inBytes, inOff, mBuf, mBufPos, len);
+ mBufPos += len;
+ break;
}
}
+
+ int inLimit1 = inOff + len - mBuf.Length;
+ int inLimit2 = inLimit1 - BufSize;
+
+ available = BufSize - mBufPos;
+ Array.Copy(inBytes, inOff, mBuf, mBufPos, available);
+ mPoly1305.BlockUpdate(mBuf, 0, BufSize);
+ ProcessBlock(mBuf, 0, outBytes, outOff + resultLen);
+ inOff += available;
+ resultLen += BufSize;
+
+ while (inOff <= inLimit2)
+ {
+ mPoly1305.BlockUpdate(inBytes, inOff, BufSize * 2);
+ ProcessBlocks2(inBytes, inOff, outBytes, outOff + resultLen);
+ inOff += BufSize * 2;
+ resultLen += BufSize * 2;
+ }
+
+ if (inOff <= inLimit1)
+ {
+ mPoly1305.BlockUpdate(inBytes, inOff, BufSize);
+ ProcessBlock(inBytes, inOff, outBytes, outOff + resultLen);
+ inOff += BufSize;
+ resultLen += BufSize;
+ }
+
+ mBufPos = mBuf.Length + inLimit1 - inOff;
+ Array.Copy(inBytes, inOff, mBuf, 0, mBufPos);
break;
}
case State.EncData:
{
- if (mBufPos != 0)
+ int available = BufSize - mBufPos;
+ if (len < available)
{
- while (len > 0)
- {
- --len;
- mBuf[mBufPos] = inBytes[inOff++];
- if (++mBufPos == BufSize)
- {
- ProcessData(mBuf, 0, BufSize, outBytes, outOff);
- mPoly1305.BlockUpdate(outBytes, outOff, BufSize);
- this.mBufPos = 0;
- resultLen = BufSize;
- break;
- }
- }
+ Array.Copy(inBytes, inOff, mBuf, mBufPos, len);
+ mBufPos += len;
+ break;
}
- while (len >= BufSize)
+ int inLimit1 = inOff + len - BufSize;
+ int inLimit2 = inLimit1 - BufSize;
+
+ if (mBufPos > 0)
{
- ProcessData(inBytes, inOff, BufSize, outBytes, outOff + resultLen);
- mPoly1305.BlockUpdate(outBytes, outOff + resultLen, BufSize);
- inOff += BufSize;
- len -= BufSize;
- resultLen += BufSize;
+ Array.Copy(inBytes, inOff, mBuf, mBufPos, available);
+ ProcessBlock(mBuf, 0, outBytes, outOff);
+ inOff += available;
+ resultLen = BufSize;
+ }
+
+ while (inOff <= inLimit2)
+ {
+ ProcessBlocks2(inBytes, inOff, outBytes, outOff + resultLen);
+ inOff += BufSize * 2;
+ resultLen += BufSize * 2;
}
- if (len > 0)
+ if (inOff <= inLimit1)
{
- Array.Copy(inBytes, inOff, mBuf, 0, len);
- this.mBufPos = len;
+ ProcessBlock(inBytes, inOff, outBytes, outOff + resultLen);
+ inOff += BufSize;
+ resultLen += BufSize;
}
+
+ mPoly1305.BlockUpdate(outBytes, outOff, resultLen);
+
+ mBufPos = BufSize + inLimit1 - inOff;
+ Array.Copy(inBytes, inOff, mBuf, 0, mBufPos);
break;
}
default:
@@ -500,6 +546,24 @@ namespace Org.BouncyCastle.Crypto.Modes
}
}
+ private void ProcessBlock(byte[] inBytes, int inOff, byte[] outBytes, int outOff)
+ { // Single-block helper: hands inBytes[inOff..] to mChacha20.ProcessBlock and accounts for 64 bytes of data.
+ Check.OutputLength(outBytes, outOff, 64, "output buffer too short");
+ // The output range is checked above before the cipher call writes to outBytes; no input-length check is made in this helper.
+ mChacha20.ProcessBlock(inBytes, inOff, outBytes, outOff);
+ // NOTE(review): presumably IncrementCount returns mDataCount + 64 and rejects totals beyond DataLimit; confirm against its definition.
+ this.mDataCount = IncrementCount(mDataCount, 64U, DataLimit);
+ }
+
+ private void ProcessBlocks2(byte[] inBytes, int inOff, byte[] outBytes, int outOff)
+ { // Double-width helper: hands inBytes[inOff..] to mChacha20.ProcessBlocks2 and accounts for 128 bytes of data.
+ Check.OutputLength(outBytes, outOff, 128, "output buffer too short");
+ // The output range is checked above before the cipher call writes to outBytes; no input-length check is made in this helper.
+ mChacha20.ProcessBlocks2(inBytes, inOff, outBytes, outOff);
+ // NOTE(review): presumably IncrementCount returns mDataCount + 128 and rejects totals beyond DataLimit; confirm against its definition.
+ this.mDataCount = IncrementCount(mDataCount, 128U, DataLimit);
+ }
+
private void ProcessData(byte[] inBytes, int inOff, int inLen, byte[] outBytes, int outOff)
{
Check.OutputLength(outBytes, outOff, inLen, "output buffer too short");
diff --git a/crypto/src/tls/crypto/impl/bc/BcChaCha20Poly1305.cs b/crypto/src/tls/crypto/impl/bc/BcChaCha20Poly1305.cs
index 8d801ed7a..ab78d0ce2 100644
--- a/crypto/src/tls/crypto/impl/bc/BcChaCha20Poly1305.cs
+++ b/crypto/src/tls/crypto/impl/bc/BcChaCha20Poly1305.cs
@@ -30,7 +30,7 @@ namespace Org.BouncyCastle.Tls.Crypto.Impl.BC
{
int ciphertextLength = inputLength;
- m_cipher.ProcessBytes(input, inputOffset, inputLength, output, outputOffset);
+ m_cipher.DoFinal(input, inputOffset, inputLength, output, outputOffset);
int outputLength = inputLength;
if (ciphertextLength != outputLength)
@@ -63,7 +63,7 @@ namespace Org.BouncyCastle.Tls.Crypto.Impl.BC
if (badMac)
throw new TlsFatalAlert(AlertDescription.bad_record_mac);
- m_cipher.ProcessBytes(input, inputOffset, ciphertextLength, output, outputOffset);
+ m_cipher.DoFinal(input, inputOffset, ciphertextLength, output, outputOffset);
int outputLength = ciphertextLength;
if (ciphertextLength != outputLength)
|