diff options
-rw-r--r-- | crypto/Contributors.html | 3 | ||||
-rw-r--r-- | crypto/src/crypto/engines/ChaCha7539Engine.cs | 405 | ||||
-rw-r--r-- | crypto/src/crypto/engines/ChaChaEngine.cs | 245 | ||||
-rw-r--r-- | crypto/src/crypto/engines/Salsa20Engine.cs | 12 | ||||
-rw-r--r-- | crypto/src/crypto/macs/Poly1305.cs | 176 | ||||
-rw-r--r-- | crypto/src/crypto/modes/ChaCha20Poly1305.cs | 130 | ||||
-rw-r--r-- | crypto/src/tls/crypto/impl/bc/BcChaCha20Poly1305.cs | 4 |
7 files changed, 763 insertions, 212 deletions
diff --git a/crypto/Contributors.html b/crypto/Contributors.html index 388b023fd..1a7479003 100644 --- a/crypto/Contributors.html +++ b/crypto/Contributors.html @@ -36,7 +36,8 @@ <p>Contributors - Organisations.<p> <ul> <li> -Monash University, Cyber Security Lab, under the supervision of A. Prof. Ron Steinfeld, Dr. Amin Sakzad, and Dr. Raymond K. Zhao for contributions to the NIST post-quantum algorithm set. Initial NTRU Prime implementation: Yuki Kume. +Monash University, Cyber Security Lab, under the supervision of A. Prof. Ron Steinfeld, Dr. Amin Sakzad, and Dr. Raymond K. Zhao +for contributions to the NIST post-quantum algorithm set. Initial NTRU Prime implementation: Yuki Kume. </li> </ul> <p>Contributors - People</p> diff --git a/crypto/src/crypto/engines/ChaCha7539Engine.cs b/crypto/src/crypto/engines/ChaCha7539Engine.cs index 206416a98..81e97478b 100644 --- a/crypto/src/crypto/engines/ChaCha7539Engine.cs +++ b/crypto/src/crypto/engines/ChaCha7539Engine.cs @@ -1,4 +1,12 @@ using System; +using System.Diagnostics; +#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER +using System.Runtime.CompilerServices; +#endif +#if NETCOREAPP3_0_OR_GREATER +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif using Org.BouncyCastle.Crypto.Utilities; @@ -58,9 +66,398 @@ namespace Org.BouncyCastle.Crypto.Engines protected override void GenerateKeyStream(byte[] output) { - ChaChaEngine.ChachaCore(rounds, engineState, x); - Pack.UInt32_To_LE(x, output, 0); + ChaChaEngine.ChachaCore(rounds, engineState, output); } - } -} + internal void DoFinal(byte[] inBuf, int inOff, int inLen, byte[] outBuf, int outOff) + { + if (!initialised) + throw new InvalidOperationException(AlgorithmName + " not initialised"); + if (index != 0) + throw new InvalidOperationException(AlgorithmName + " not in block-aligned state"); + + Check.DataLength(inBuf, inOff, inLen, "input buffer too short"); + Check.OutputLength(outBuf, outOff, inLen, "output buffer too short"); + + while (inLen >= 128) + { + ProcessBlocks2(inBuf, inOff, outBuf, outOff); + inOff += 128; + inLen -= 128; + outOff += 128; + } + + if (inLen >= 64) + { + ImplProcessBlock(inBuf, inOff, outBuf, outOff); + inOff += 64; + inLen -= 64; + outOff += 64; + } + + if (inLen > 0) + { + GenerateKeyStream(keyStream); + AdvanceCounter(); + + for (int i = 0; i < inLen; ++i) + { + outBuf[outOff + i] = (byte)(inBuf[i + inOff] ^ keyStream[i]); + } + } + + engineState[12] = 0; + + // TODO Prevent re-use if encrypting + } + + internal void ProcessBlock(byte[] inBytes, int inOff, byte[] outBytes, int outOff) + { + if (!initialised) + throw new InvalidOperationException(AlgorithmName + " not initialised"); + if (LimitExceeded(64U)) + throw new MaxBytesExceededException("2^38 byte limit per IV would be exceeded; Change IV"); + + Debug.Assert(index == 0); + + ImplProcessBlock(inBytes, inOff, outBytes, outOff); + } + + internal void ProcessBlocks2(byte[] inBytes, int inOff, byte[] outBytes, int outOff) + { + if (!initialised) + throw new InvalidOperationException(AlgorithmName + " not initialised"); + if (LimitExceeded(128U)) + throw new MaxBytesExceededException("2^38 byte limit per IV would be exceeded; Change IV"); + + Debug.Assert(index == 0); + +#if NETCOREAPP3_0_OR_GREATER + if (Avx2.IsSupported) + { + ImplProcessBlocks2_X86_Avx2(rounds, engineState, inBytes.AsSpan(inOff), outBytes.AsSpan(outOff)); + return; + } + + if (Sse2.IsSupported) + { + ImplProcessBlocks2_X86_Sse2(rounds, engineState, inBytes.AsSpan(inOff), outBytes.AsSpan(outOff)); + return; + } +#endif + + { + ImplProcessBlock(inBytes, inOff, outBytes, outOff); + ImplProcessBlock(inBytes, inOff + 64, outBytes, outOff + 64); + } + } + +#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER + [MethodImpl(MethodImplOptions.AggressiveInlining)] +#endif + internal void ImplProcessBlock(byte[] inBuf, int inOff, byte[] outBuf, int outOff) + { + ChaChaEngine.ChachaCore(rounds, engineState, keyStream); + AdvanceCounter(); + + for (int i = 0; i < 64; ++i) + { + outBuf[outOff + i] = (byte)(keyStream[i] ^ inBuf[inOff + i]); + } + } + +#if NETCOREAPP3_0_OR_GREATER + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void ImplProcessBlocks2_X86_Avx2(int rounds, uint[] state, Span<byte> input, Span<byte> output) + { + if (!Avx2.IsSupported) + throw new PlatformNotSupportedException(); + + Debug.Assert(rounds % 2 == 0); + Debug.Assert(state.Length >= 16); + Debug.Assert(input.Length >= 128); + Debug.Assert(output.Length >= 128); + + var t0 = Load128_UInt32(state.AsSpan()); + var t1 = Load128_UInt32(state.AsSpan(4)); + var t2 = Load128_UInt32(state.AsSpan(8)); + var t3 = Load128_UInt32(state.AsSpan(12)); + ++state[12]; + var t4 = Load128_UInt32(state.AsSpan(12)); + ++state[12]; + + var x0 = Vector256.Create(t0, t0); + var x1 = Vector256.Create(t1, t1); + var x2 = Vector256.Create(t2, t2); + var x3 = Vector256.Create(t3, t4); + + var v0 = x0; + var v1 = x1; + var v2 = x2; + var v3 = x3; + + for (int i = rounds; i > 0; i -= 2) + { + v0 = Avx2.Add(v0, v1); + v3 = Avx2.Xor(v3, v0); + v3 = Avx2.Xor(Avx2.ShiftLeftLogical(v3, 16), Avx2.ShiftRightLogical(v3, 16)); + v2 = Avx2.Add(v2, v3); + v1 = Avx2.Xor(v1, v2); + v1 = Avx2.Xor(Avx2.ShiftLeftLogical(v1, 12), Avx2.ShiftRightLogical(v1, 20)); + v0 = Avx2.Add(v0, v1); + v3 = Avx2.Xor(v3, v0); + v3 = Avx2.Xor(Avx2.ShiftLeftLogical(v3, 8), Avx2.ShiftRightLogical(v3, 24)); + v2 = Avx2.Add(v2, v3); + v1 = Avx2.Xor(v1, v2); + v1 = Avx2.Xor(Avx2.ShiftLeftLogical(v1, 7), Avx2.ShiftRightLogical(v1, 25)); + + v1 = Avx2.Shuffle(v1, 0x39); + v2 = Avx2.Shuffle(v2, 0x4E); + v3 = Avx2.Shuffle(v3, 0x93); + + v0 = Avx2.Add(v0, v1); + v3 = Avx2.Xor(v3, v0); + v3 = Avx2.Xor(Avx2.ShiftLeftLogical(v3, 16), Avx2.ShiftRightLogical(v3, 16)); + v2 = Avx2.Add(v2, v3); + v1 = Avx2.Xor(v1, v2); + v1 = Avx2.Xor(Avx2.ShiftLeftLogical(v1, 12), Avx2.ShiftRightLogical(v1, 20)); + v0 = Avx2.Add(v0, v1); + v3 = Avx2.Xor(v3, v0); + v3 = Avx2.Xor(Avx2.ShiftLeftLogical(v3, 8), Avx2.ShiftRightLogical(v3, 24)); + v2 = Avx2.Add(v2, v3); + v1 = Avx2.Xor(v1, v2); + v1 = Avx2.Xor(Avx2.ShiftLeftLogical(v1, 7), Avx2.ShiftRightLogical(v1, 25)); + + v1 = Avx2.Shuffle(v1, 0x93); + v2 = Avx2.Shuffle(v2, 0x4E); + v3 = Avx2.Shuffle(v3, 0x39); + } + + v0 = Avx2.Add(v0, x0); + v1 = Avx2.Add(v1, x1); + v2 = Avx2.Add(v2, x2); + v3 = Avx2.Add(v3, x3); + + var n0 = Avx2.Permute2x128(v0, v1, 0x20).AsByte(); + var n1 = Avx2.Permute2x128(v2, v3, 0x20).AsByte(); + var n2 = Avx2.Permute2x128(v0, v1, 0x31).AsByte(); + var n3 = Avx2.Permute2x128(v2, v3, 0x31).AsByte(); + + n0 = Avx2.Xor(n0, Load256_Byte(input)); + n1 = Avx2.Xor(n1, Load256_Byte(input.Slice(0x20))); + n2 = Avx2.Xor(n2, Load256_Byte(input.Slice(0x40))); + n3 = Avx2.Xor(n3, Load256_Byte(input.Slice(0x60))); + + Store256_Byte(ref n0, output); + Store256_Byte(ref n1, output.Slice(0x20)); + Store256_Byte(ref n2, output.Slice(0x40)); + Store256_Byte(ref n3, output.Slice(0x60)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void ImplProcessBlocks2_X86_Sse2(int rounds, uint[] state, Span<byte> input, Span<byte> output) + { + if (!Sse2.IsSupported) + throw new PlatformNotSupportedException(); + + Debug.Assert(rounds % 2 == 0); + Debug.Assert(state.Length >= 16); + Debug.Assert(input.Length >= 128); + Debug.Assert(output.Length >= 128); + + var x0 = Load128_UInt32(state.AsSpan()); + var x1 = Load128_UInt32(state.AsSpan(4)); + var x2 = Load128_UInt32(state.AsSpan(8)); + var x3 = Load128_UInt32(state.AsSpan(12)); + ++state[12]; + + var v0 = x0; + var v1 = x1; + var v2 = x2; + var v3 = x3; + + for (int i = rounds; i > 0; i -= 2) + { + v0 = Sse2.Add(v0, v1); + v3 = Sse2.Xor(v3, v0); + v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16)); + v2 = Sse2.Add(v2, v3); + v1 = Sse2.Xor(v1, v2); + v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20)); + v0 = Sse2.Add(v0, v1); + v3 = Sse2.Xor(v3, v0); + v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24)); + v2 = Sse2.Add(v2, v3); + v1 = Sse2.Xor(v1, v2); + v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25)); + + v1 = Sse2.Shuffle(v1, 0x39); + v2 = Sse2.Shuffle(v2, 0x4E); + v3 = Sse2.Shuffle(v3, 0x93); + + v0 = Sse2.Add(v0, v1); + v3 = Sse2.Xor(v3, v0); + v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16)); + v2 = Sse2.Add(v2, v3); + v1 = Sse2.Xor(v1, v2); + v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20)); + v0 = Sse2.Add(v0, v1); + v3 = Sse2.Xor(v3, v0); + v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24)); + v2 = Sse2.Add(v2, v3); + v1 = Sse2.Xor(v1, v2); + v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25)); + + v1 = Sse2.Shuffle(v1, 0x93); + v2 = Sse2.Shuffle(v2, 0x4E); + v3 = Sse2.Shuffle(v3, 0x39); + } + + v0 = Sse2.Add(v0, x0); + v1 = Sse2.Add(v1, x1); + v2 = Sse2.Add(v2, x2); + v3 = Sse2.Add(v3, x3); + + var n0 = Load128_Byte(input); + var n1 = Load128_Byte(input.Slice(0x10)); + var n2 = Load128_Byte(input.Slice(0x20)); + var n3 = Load128_Byte(input.Slice(0x30)); + + n0 = Sse2.Xor(n0, v0.AsByte()); + n1 = Sse2.Xor(n1, v1.AsByte()); + n2 = Sse2.Xor(n2, v2.AsByte()); + n3 = Sse2.Xor(n3, v3.AsByte()); + + Store128_Byte(ref n0, output); + Store128_Byte(ref n1, output.Slice(0x10)); + Store128_Byte(ref n2, output.Slice(0x20)); + Store128_Byte(ref n3, output.Slice(0x30)); + + x3 = Load128_UInt32(state.AsSpan(12)); + ++state[12]; + + v0 = x0; + v1 = x1; + v2 = x2; + v3 = x3; + + for (int i = rounds; i > 0; i -= 2) + { + v0 = Sse2.Add(v0, v1); + v3 = Sse2.Xor(v3, v0); + v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16)); + v2 = Sse2.Add(v2, v3); + v1 = Sse2.Xor(v1, v2); + v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20)); + v0 = Sse2.Add(v0, v1); + v3 = Sse2.Xor(v3, v0); + v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24)); + v2 = Sse2.Add(v2, v3); + v1 = Sse2.Xor(v1, v2); + v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25)); + + v1 = Sse2.Shuffle(v1, 0x39); + v2 = Sse2.Shuffle(v2, 0x4E); + v3 = Sse2.Shuffle(v3, 0x93); + + v0 = Sse2.Add(v0, v1); + v3 = Sse2.Xor(v3, v0); + v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16)); + v2 = Sse2.Add(v2, v3); + v1 = Sse2.Xor(v1, v2); + v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20)); + v0 = Sse2.Add(v0, v1); + v3 = Sse2.Xor(v3, v0); + v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24)); + v2 = Sse2.Add(v2, v3); + v1 = Sse2.Xor(v1, v2); + v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25)); + + v1 = Sse2.Shuffle(v1, 0x93); + v2 = Sse2.Shuffle(v2, 0x4E); + v3 = Sse2.Shuffle(v3, 0x39); + } + + v0 = Sse2.Add(v0, x0); + v1 = Sse2.Add(v1, x1); + v2 = Sse2.Add(v2, x2); + v3 = Sse2.Add(v3, x3); + + n0 = Load128_Byte(input.Slice(0x40)); + n1 = Load128_Byte(input.Slice(0x50)); + n2 = Load128_Byte(input.Slice(0x60)); + n3 = Load128_Byte(input.Slice(0x70)); + + n0 = Sse2.Xor(n0, v0.AsByte()); + n1 = Sse2.Xor(n1, v1.AsByte()); + n2 = Sse2.Xor(n2, v2.AsByte()); + n3 = Sse2.Xor(n3, v3.AsByte()); + + Store128_Byte(ref n0, output.Slice(0x40)); + Store128_Byte(ref n1, output.Slice(0x50)); + Store128_Byte(ref n2, output.Slice(0x60)); + Store128_Byte(ref n3, output.Slice(0x70)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128<byte> Load128_Byte(Span<byte> t) + { + if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<byte>>() == 16) + return Unsafe.ReadUnaligned<Vector128<byte>>(ref t[0]); + + return Vector128.Create(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], t[8], t[9], t[10], t[11], t[12], + t[13], t[14], t[15]); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128<uint> Load128_UInt32(Span<uint> t) + { + if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<uint>>() == 16) + return Unsafe.ReadUnaligned<Vector128<uint>>(ref Unsafe.As<uint, byte>(ref t[0])); + + return Vector128.Create(t[0], t[1], t[2], t[3]); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256<byte> Load256_Byte(Span<byte> t) + { + if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector256<byte>>() == 32) + return Unsafe.ReadUnaligned<Vector256<byte>>(ref t[0]); + + return Vector256.Create(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], t[8], t[9], t[10], t[11], t[12], + t[13], t[14], t[15], t[16], t[17], t[18], t[19], t[20], t[21], t[22], t[23], t[24], t[25], t[26], t[27], + t[28], t[29], t[30], t[31]); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Store128_Byte(ref Vector128<byte> s, Span<byte> t) + { + if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<byte>>() == 16) + { + Unsafe.WriteUnaligned(ref t[0], s); + return; + } + + var u = s.AsUInt64(); + Pack.UInt64_To_LE(u.GetElement(0), t); + Pack.UInt64_To_LE(u.GetElement(1), t.Slice(8)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Store256_Byte(ref Vector256<byte> s, Span<byte> t) + { + if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector256<byte>>() == 32) + { + Unsafe.WriteUnaligned(ref t[0], s); + return; + } + + var u = s.AsUInt64(); + Pack.UInt64_To_LE(u.GetElement(0), t); + Pack.UInt64_To_LE(u.GetElement(1), t.Slice(8)); + Pack.UInt64_To_LE(u.GetElement(2), t.Slice(16)); + Pack.UInt64_To_LE(u.GetElement(3), t.Slice(24)); + } +#endif + } +} diff --git a/crypto/src/crypto/engines/ChaChaEngine.cs b/crypto/src/crypto/engines/ChaChaEngine.cs index a97c04e08..a16491ba0 100644 --- a/crypto/src/crypto/engines/ChaChaEngine.cs +++ b/crypto/src/crypto/engines/ChaChaEngine.cs @@ -1,4 +1,10 @@ using System; +using System.Diagnostics; +#if NETCOREAPP3_0_OR_GREATER +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif using Org.BouncyCastle.Crypto.Utilities; using Org.BouncyCastle.Utilities; @@ -65,94 +71,169 @@ namespace Org.BouncyCastle.Crypto.Engines protected override void GenerateKeyStream(byte[] output) { - ChachaCore(rounds, engineState, x); - Pack.UInt32_To_LE(x, output, 0); + ChachaCore(rounds, engineState, output); } - /// <summary> - /// ChaCha function. - /// </summary> - /// <param name="rounds">The number of ChaCha rounds to execute</param> - /// <param name="input">The input words.</param> - /// <param name="x">The ChaCha state to modify.</param> - internal static void ChachaCore(int rounds, uint[] input, uint[] x) + internal static void ChachaCore(int rounds, uint[] input, byte[] output) + { + Debug.Assert(rounds % 2 == 0); + Debug.Assert(input.Length >= 16); + Debug.Assert(output.Length >= 64); + +#if NETCOREAPP3_0_OR_GREATER + if (Sse2.IsSupported) + { + var x0 = Load128_UInt32(input.AsSpan()); + var x1 = Load128_UInt32(input.AsSpan(4)); + var x2 = Load128_UInt32(input.AsSpan(8)); + var x3 = Load128_UInt32(input.AsSpan(12)); + + var v0 = x0; + var v1 = x1; + var v2 = x2; + var v3 = x3; + + for (int i = rounds; i > 0; i -= 2) + { + v0 = Sse2.Add(v0, v1); + v3 = Sse2.Xor(v3, v0); + v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16)); + v2 = Sse2.Add(v2, v3); + v1 = Sse2.Xor(v1, v2); + v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20)); + v0 = Sse2.Add(v0, v1); + v3 = Sse2.Xor(v3, v0); + v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24)); + v2 = Sse2.Add(v2, v3); + v1 = Sse2.Xor(v1, v2); + v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25)); + + v1 = Sse2.Shuffle(v1, 0x39); + v2 = Sse2.Shuffle(v2, 0x4E); + v3 = Sse2.Shuffle(v3, 0x93); + + v0 = Sse2.Add(v0, v1); + v3 = Sse2.Xor(v3, v0); + v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16)); + v2 = Sse2.Add(v2, v3); + v1 = Sse2.Xor(v1, v2); + v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20)); + v0 = Sse2.Add(v0, v1); + v3 = Sse2.Xor(v3, v0); + v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24)); + v2 = Sse2.Add(v2, v3); + v1 = Sse2.Xor(v1, v2); + v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25)); + + v1 = Sse2.Shuffle(v1, 0x93); + v2 = Sse2.Shuffle(v2, 0x4E); + v3 = Sse2.Shuffle(v3, 0x39); + } + + v0 = Sse2.Add(v0, x0); + v1 = Sse2.Add(v1, x1); + v2 = Sse2.Add(v2, x2); + v3 = Sse2.Add(v3, x3); + + Store128_UInt32(ref v0, output.AsSpan()); + Store128_UInt32(ref v1, output.AsSpan(0x10)); + Store128_UInt32(ref v2, output.AsSpan(0x20)); + Store128_UInt32(ref v3, output.AsSpan(0x30)); + return; + } +#endif + + { + uint x00 = input[ 0], x01 = input[ 1], x02 = input[ 2], x03 = input[ 3]; + uint x04 = input[ 4], x05 = input[ 5], x06 = input[ 6], x07 = input[ 7]; + uint x08 = input[ 8], x09 = input[ 9], x10 = input[10], x11 = input[11]; + uint x12 = input[12], x13 = input[13], x14 = input[14], x15 = input[15]; + + for (int i = rounds; i > 0; i -= 2) + { + x00 += x04; x12 = Integers.RotateLeft(x12 ^ x00, 16); + x01 += x05; x13 = Integers.RotateLeft(x13 ^ x01, 16); + x02 += x06; x14 = Integers.RotateLeft(x14 ^ x02, 16); + x03 += x07; x15 = Integers.RotateLeft(x15 ^ x03, 16); + + x08 += x12; x04 = Integers.RotateLeft(x04 ^ x08, 12); + x09 += x13; x05 = Integers.RotateLeft(x05 ^ x09, 12); + x10 += x14; x06 = Integers.RotateLeft(x06 ^ x10, 12); + x11 += x15; x07 = Integers.RotateLeft(x07 ^ x11, 12); + + x00 += x04; x12 = Integers.RotateLeft(x12 ^ x00, 8); + x01 += x05; x13 = Integers.RotateLeft(x13 ^ x01, 8); + x02 += x06; x14 = Integers.RotateLeft(x14 ^ x02, 8); + x03 += x07; x15 = Integers.RotateLeft(x15 ^ x03, 8); + + x08 += x12; x04 = Integers.RotateLeft(x04 ^ x08, 7); + x09 += x13; x05 = Integers.RotateLeft(x05 ^ x09, 7); + x10 += x14; x06 = Integers.RotateLeft(x06 ^ x10, 7); + x11 += x15; x07 = Integers.RotateLeft(x07 ^ x11, 7); + + x00 += x05; x15 = Integers.RotateLeft(x15 ^ x00, 16); + x01 += x06; x12 = Integers.RotateLeft(x12 ^ x01, 16); + x02 += x07; x13 = Integers.RotateLeft(x13 ^ x02, 16); + x03 += x04; x14 = Integers.RotateLeft(x14 ^ x03, 16); + + x10 += x15; x05 = Integers.RotateLeft(x05 ^ x10, 12); + x11 += x12; x06 = Integers.RotateLeft(x06 ^ x11, 12); + x08 += x13; x07 = Integers.RotateLeft(x07 ^ x08, 12); + x09 += x14; x04 = Integers.RotateLeft(x04 ^ x09, 12); + + x00 += x05; x15 = Integers.RotateLeft(x15 ^ x00, 8); + x01 += x06; x12 = Integers.RotateLeft(x12 ^ x01, 8); + x02 += x07; x13 = Integers.RotateLeft(x13 ^ x02, 8); + x03 += x04; x14 = Integers.RotateLeft(x14 ^ x03, 8); + + x10 += x15; x05 = Integers.RotateLeft(x05 ^ x10, 7); + x11 += x12; x06 = Integers.RotateLeft(x06 ^ x11, 7); + x08 += x13; x07 = Integers.RotateLeft(x07 ^ x08, 7); + x09 += x14; x04 = Integers.RotateLeft(x04 ^ x09, 7); + } + + Pack.UInt32_To_LE(x00 + input[ 0], output, 0); + Pack.UInt32_To_LE(x01 + input[ 1], output, 4); + Pack.UInt32_To_LE(x02 + input[ 2], output, 8); + Pack.UInt32_To_LE(x03 + input[ 3], output, 12); + Pack.UInt32_To_LE(x04 + input[ 4], output, 16); + Pack.UInt32_To_LE(x05 + input[ 5], output, 20); + Pack.UInt32_To_LE(x06 + input[ 6], output, 24); + Pack.UInt32_To_LE(x07 + input[ 7], output, 28); + Pack.UInt32_To_LE(x08 + input[ 8], output, 32); + Pack.UInt32_To_LE(x09 + input[ 9], output, 36); + Pack.UInt32_To_LE(x10 + input[10], output, 40); + Pack.UInt32_To_LE(x11 + input[11], output, 44); + Pack.UInt32_To_LE(x12 + input[12], output, 48); + Pack.UInt32_To_LE(x13 + input[13], output, 52); + Pack.UInt32_To_LE(x14 + input[14], output, 56); + Pack.UInt32_To_LE(x15 + input[15], output, 60); + } + } + +#if NETCOREAPP3_0_OR_GREATER + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128<uint> Load128_UInt32(Span<uint> t) + { + if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<uint>>() == 16) + return Unsafe.ReadUnaligned<Vector128<uint>>(ref Unsafe.As<uint, byte>(ref t[0])); + + return Vector128.Create(t[0], t[1], t[2], t[3]); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Store128_UInt32(ref Vector128<uint> s, Span<byte> t) { - if (input.Length != 16) - throw new ArgumentException(); - if (x.Length != 16) - throw new ArgumentException(); - if (rounds % 2 != 0) - throw new ArgumentException("Number of rounds must be even"); - - uint x00 = input[ 0]; - uint x01 = input[ 1]; - uint x02 = input[ 2]; - uint x03 = input[ 3]; - uint x04 = input[ 4]; - uint x05 = input[ 5]; - uint x06 = input[ 6]; - uint x07 = input[ 7]; - uint x08 = input[ 8]; - uint x09 = input[ 9]; - uint x10 = input[10]; - uint x11 = input[11]; - uint x12 = input[12]; - uint x13 = input[13]; - uint x14 = input[14]; - uint x15 = input[15]; - - for (int i = rounds; i > 0; i -= 2) + if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<uint>>() == 16) { - x00 += x04; x12 = Integers.RotateLeft(x12 ^ x00, 16); - x08 += x12; x04 = Integers.RotateLeft(x04 ^ x08, 12); - x00 += x04; x12 = Integers.RotateLeft(x12 ^ x00, 8); - x08 += x12; x04 = Integers.RotateLeft(x04 ^ x08, 7); - x01 += x05; x13 = Integers.RotateLeft(x13 ^ x01, 16); - x09 += x13; x05 = Integers.RotateLeft(x05 ^ x09, 12); - x01 += x05; x13 = Integers.RotateLeft(x13 ^ x01, 8); - x09 += x13; x05 = Integers.RotateLeft(x05 ^ x09, 7); - x02 += x06; x14 = Integers.RotateLeft(x14 ^ x02, 16); - x10 += x14; x06 = Integers.RotateLeft(x06 ^ x10, 12); - x02 += x06; x14 = Integers.RotateLeft(x14 ^ x02, 8); - x10 += x14; x06 = Integers.RotateLeft(x06 ^ x10, 7); - x03 += x07; x15 = Integers.RotateLeft(x15 ^ x03, 16); - x11 += x15; x07 = Integers.RotateLeft(x07 ^ x11, 12); - x03 += x07; x15 = Integers.RotateLeft(x15 ^ x03, 8); - x11 += x15; x07 = Integers.RotateLeft(x07 ^ x11, 7); - x00 += x05; x15 = Integers.RotateLeft(x15 ^ x00, 16); - x10 += x15; x05 = Integers.RotateLeft(x05 ^ x10, 12); - x00 += x05; x15 = Integers.RotateLeft(x15 ^ x00, 8); - x10 += x15; x05 = Integers.RotateLeft(x05 ^ x10, 7); - x01 += x06; x12 = Integers.RotateLeft(x12 ^ x01, 16); - x11 += x12; x06 = Integers.RotateLeft(x06 ^ x11, 12); - x01 += x06; x12 = Integers.RotateLeft(x12 ^ x01, 8); - x11 += x12; x06 = Integers.RotateLeft(x06 ^ x11, 7); - x02 += x07; x13 = Integers.RotateLeft(x13 ^ x02, 16); - x08 += x13; x07 = Integers.RotateLeft(x07 ^ x08, 12); - x02 += x07; x13 = Integers.RotateLeft(x13 ^ x02, 8); - x08 += x13; x07 = Integers.RotateLeft(x07 ^ x08, 7); - x03 += x04; x14 = Integers.RotateLeft(x14 ^ x03, 16); - x09 += x14; x04 = Integers.RotateLeft(x04 ^ x09, 12); - x03 += x04; x14 = Integers.RotateLeft(x14 ^ x03, 8); - x09 += x14; x04 = Integers.RotateLeft(x04 ^ x09, 7); + Unsafe.WriteUnaligned(ref t[0], s); + return; } - x[ 0] = x00 + input[ 0]; - x[ 1] = x01 + input[ 1]; - x[ 2] = x02 + input[ 2]; - x[ 3] = x03 + input[ 3]; - x[ 4] = x04 + input[ 4]; - x[ 5] = x05 + input[ 5]; - x[ 6] = x06 + input[ 6]; - x[ 7] = x07 + input[ 7]; - x[ 8] = x08 + input[ 8]; - x[ 9] = x09 + input[ 9]; - x[10] = x10 + input[10]; - x[11] = x11 + input[11]; - x[12] = x12 + input[12]; - x[13] = x13 + input[13]; - x[14] = x14 + input[14]; - x[15] = x15 + input[15]; + var u = s.AsUInt64(); + Pack.UInt64_To_LE(u.GetElement(0), t); + Pack.UInt64_To_LE(u.GetElement(1), t.Slice(8)); } +#endif } } diff --git a/crypto/src/crypto/engines/Salsa20Engine.cs b/crypto/src/crypto/engines/Salsa20Engine.cs index a8170d173..77b08f9fc 100644 --- a/crypto/src/crypto/engines/Salsa20Engine.cs +++ b/crypto/src/crypto/engines/Salsa20Engine.cs @@ -35,11 +35,11 @@ namespace Org.BouncyCastle.Crypto.Engines * variables to hold the state of the engine * during encryption and decryption */ - private int index = 0; + internal int index = 0; internal uint[] engineState = new uint[StateSize]; // state internal uint[] x = new uint[StateSize]; // internal buffer - private byte[] keyStream = new byte[StateSize * 4]; // expanded state, 64 bytes - private bool initialised = false; + internal byte[] keyStream = new byte[StateSize * 4]; // expanded state, 64 bytes + internal bool initialised = false; /* * internal counter @@ -302,14 +302,14 @@ namespace Org.BouncyCastle.Crypto.Engines x[15] = x15 + input[15]; } - private void ResetLimitCounter() + internal void ResetLimitCounter() { cW0 = 0; cW1 = 0; cW2 = 0; } - private bool LimitExceeded() + internal bool LimitExceeded() { if (++cW0 == 0) { @@ -325,7 +325,7 @@ namespace Org.BouncyCastle.Crypto.Engines /* * this relies on the fact len will always be positive. */ - private bool LimitExceeded( + internal bool LimitExceeded( uint len) { uint old = cW0; diff --git a/crypto/src/crypto/macs/Poly1305.cs b/crypto/src/crypto/macs/Poly1305.cs index c0a660fac..595d9b051 100644 --- a/crypto/src/crypto/macs/Poly1305.cs +++ b/crypto/src/crypto/macs/Poly1305.cs @@ -1,6 +1,9 @@ using System; +using System.Diagnostics; +#if NETCOREAPP3_0_OR_GREATER +using System.Runtime.CompilerServices; +#endif -using Org.BouncyCastle.Crypto.Generators; using Org.BouncyCastle.Crypto.Parameters; using Org.BouncyCastle.Crypto.Utilities; @@ -27,8 +30,6 @@ namespace Org.BouncyCastle.Crypto.Macs private readonly IBlockCipher cipher; - private readonly byte[] singleByte = new byte[1]; - // Initialised state /** Polynomial key */ @@ -163,61 +164,79 @@ namespace Org.BouncyCastle.Crypto.Macs public void Update(byte input) { - singleByte[0] = input; - BlockUpdate(singleByte, 0, 1); + currentBlock[currentBlockOffset++] = input; + if (currentBlockOffset == BlockSize) + { + ProcessBlock(currentBlock, 0); + currentBlockOffset = 0; + } } public void BlockUpdate(byte[] input, int inOff, int len) { - int copied = 0; - while (len > copied) + // TODO Validity check on arguments + + int available = BlockSize - currentBlockOffset; + if (len < available) { - if (currentBlockOffset == BlockSize) - { - ProcessBlock(); - currentBlockOffset = 0; - } + Array.Copy(input, inOff, currentBlock, currentBlockOffset, len); + currentBlockOffset += len; + return; + } - int toCopy = System.Math.Min((len - copied), BlockSize - currentBlockOffset); - Array.Copy(input, copied + inOff, currentBlock, currentBlockOffset, toCopy); - copied += toCopy; - currentBlockOffset += toCopy; + int pos = 0; + if (currentBlockOffset > 0) + { + Array.Copy(input, inOff, currentBlock, currentBlockOffset, available); + pos = available; + ProcessBlock(currentBlock, 0); } + int remaining; + while ((remaining = len - pos) >= BlockSize) + { + ProcessBlock(input, inOff + pos); + pos += BlockSize; + } + + Array.Copy(input, inOff + pos, currentBlock, 0, remaining); + currentBlockOffset = remaining; } - private void ProcessBlock() + private void ProcessBlock(byte[] buf, int off) { - if (currentBlockOffset < BlockSize) +#if NETCOREAPP3_0_OR_GREATER + if (BitConverter.IsLittleEndian) { - currentBlock[currentBlockOffset] = 1; - for (int i = currentBlockOffset + 1; i < BlockSize; i++) - { - currentBlock[i] = 0; - } + Span<uint> t = stackalloc uint[4]; + Unsafe.CopyBlockUnaligned(ref Unsafe.As<uint, byte>(ref t[0]), ref buf[off], 16); + + h0 += t[0] & 0x3ffffffU; + h1 += ((t[1] << 6) | (t[0] >> 26)) & 0x3ffffffU; + h2 += ((t[2] << 12) | (t[1] >> 20)) & 0x3ffffffU; + h3 += ((t[3] << 18) | (t[2] >> 14)) & 0x3ffffffU; + h4 += (1 << 24) | (t[3] >> 8); } - - ulong t0 = Pack.LE_To_UInt32(currentBlock, 0); - ulong t1 = Pack.LE_To_UInt32(currentBlock, 4); - ulong t2 = Pack.LE_To_UInt32(currentBlock, 8); - ulong t3 = Pack.LE_To_UInt32(currentBlock, 12); - - h0 += (uint)(t0 & 0x3ffffffU); - h1 += (uint)((((t1 << 32) | t0) >> 26) & 0x3ffffff); - h2 += (uint)((((t2 << 32) | t1) >> 20) & 0x3ffffff); - h3 += (uint)((((t3 << 32) | t2) >> 14) & 0x3ffffff); - h4 += (uint)(t3 >> 8); - - if (currentBlockOffset == BlockSize) + else +#endif { - h4 += (1 << 24); + uint t0 = Pack.LE_To_UInt32(buf, off + 0); + uint t1 = Pack.LE_To_UInt32(buf, off + 4); + uint t2 = Pack.LE_To_UInt32(buf, off + 8); + uint t3 = Pack.LE_To_UInt32(buf, off + 12); + + h0 += t0 & 0x3ffffffU; + h1 += ((t1 << 6) | (t0 >> 26)) & 0x3ffffffU; + h2 += ((t2 << 12) | (t1 >> 20)) & 0x3ffffffU; + h3 += ((t3 << 18) | (t2 >> 14)) & 0x3ffffffU; + h4 += ( 1 << 24) | (t3 >> 8); } - ulong tp0 = mul32x32_64(h0,r0) + mul32x32_64(h1,s4) + mul32x32_64(h2,s3) + mul32x32_64(h3,s2) + mul32x32_64(h4,s1); - ulong tp1 = mul32x32_64(h0,r1) + mul32x32_64(h1,r0) + mul32x32_64(h2,s4) + mul32x32_64(h3,s3) + mul32x32_64(h4,s2); - ulong tp2 = mul32x32_64(h0,r2) + mul32x32_64(h1,r1) + mul32x32_64(h2,r0) + mul32x32_64(h3,s4) + mul32x32_64(h4,s3); - ulong tp3 = mul32x32_64(h0,r3) + mul32x32_64(h1,r2) + mul32x32_64(h2,r1) + mul32x32_64(h3,r0) + mul32x32_64(h4,s4); - ulong tp4 = mul32x32_64(h0,r4) + mul32x32_64(h1,r3) + mul32x32_64(h2,r2) + mul32x32_64(h3,r1) + mul32x32_64(h4,r0); + ulong tp0 = (ulong)h0 * r0 + (ulong)h1 * s4 + (ulong)h2 * s3 + (ulong)h3 * s2 + (ulong)h4 * s1; + ulong tp1 = (ulong)h0 * r1 + (ulong)h1 * r0 + (ulong)h2 * s4 + (ulong)h3 * s3 + (ulong)h4 * s2; + ulong tp2 = (ulong)h0 * r2 + (ulong)h1 * r1 + (ulong)h2 * r0 + (ulong)h3 * s4 + (ulong)h4 * s3; + ulong tp3 = (ulong)h0 * r3 + (ulong)h1 * r2 + (ulong)h2 * r1 + (ulong)h3 * r0 + (ulong)h4 * s4; + ulong tp4 = (ulong)h0 * r4 + (ulong)h1 * r3 + (ulong)h2 * r2 + (ulong)h3 * r1 + (ulong)h4 * r0; h0 = (uint)tp0 & 0x3ffffff; tp1 += (tp0 >> 26); h1 = (uint)tp1 & 0x3ffffff; tp2 += (tp1 >> 26); @@ -225,7 +244,7 @@ namespace Org.BouncyCastle.Crypto.Macs h3 = (uint)tp3 & 0x3ffffff; tp4 += (tp3 >> 26); h4 = (uint)tp4 & 0x3ffffff; h0 += (uint)(tp4 >> 26) * 5; - h1 += (h0 >> 26); h0 &= 0x3ffffff; + h1 += h0 >> 26; h0 &= 0x3ffffff; } public int DoFinal(byte[] output, int outOff) @@ -235,44 +254,38 @@ namespace Org.BouncyCastle.Crypto.Macs if (currentBlockOffset > 0) { // Process padded block - ProcessBlock(); + if (currentBlockOffset < BlockSize) + { + currentBlock[currentBlockOffset++] = 1; + while (currentBlockOffset < BlockSize) + { + currentBlock[currentBlockOffset++] = 0; + } + + h4 -= (1 << 24); + } + + ProcessBlock(currentBlock, 0); } - h1 += (h0 >> 26); h0 &= 0x3ffffff; - h2 += (h1 >> 26); h1 &= 0x3ffffff; - h3 += (h2 >> 26); h2 &= 0x3ffffff; - h4 += (h3 >> 26); h3 &= 0x3ffffff; - h0 += (h4 >> 26) * 5; h4 &= 0x3ffffff; - h1 += (h0 >> 26); h0 &= 0x3ffffff; - - uint g0, g1, g2, g3, g4, b; - g0 = h0 + 5; b = g0 >> 26; g0 &= 0x3ffffff; - g1 = h1 + b; b = g1 >> 26; g1 &= 0x3ffffff; - g2 = h2 + b; b = g2 >> 26; g2 &= 0x3ffffff; - g3 = h3 + b; b = g3 >> 26; g3 &= 0x3ffffff; - g4 = h4 + b - (1 << 26); - - b = (g4 >> 31) - 1; - uint nb = ~b; - h0 = (h0 & nb) | (g0 & b); - h1 = (h1 & nb) | (g1 & b); - h2 = (h2 & nb) | (g2 & b); - h3 = (h3 & nb) | (g3 & b); - h4 = (h4 & nb) | (g4 & b); - - ulong f0, f1, f2, f3; - f0 = ((h0 ) | (h1 << 26)) + (ulong)k0; - f1 = ((h1 >> 6 ) | (h2 << 20)) + (ulong)k1; - f2 = ((h2 >> 12) | (h3 << 14)) + (ulong)k2; - f3 = ((h3 >> 18) | (h4 << 8 )) + (ulong)k3; - - Pack.UInt32_To_LE((uint)f0, output, outOff); - f1 += (f0 >> 32); - Pack.UInt32_To_LE((uint)f1, output, outOff + 4); - f2 += (f1 >> 32); - Pack.UInt32_To_LE((uint)f2, output, outOff + 8); - f3 += (f2 >> 32); - Pack.UInt32_To_LE((uint)f3, output, outOff + 12); + Debug.Assert(h4 >> 26 == 0); + + //h0 += (h4 >> 26) * 5U + 5U; h4 &= 0x3ffffff; + h0 += 5U; + h1 += h0 >> 26; h0 &= 0x3ffffff; + h2 += h1 >> 26; h1 &= 0x3ffffff; + h3 += h2 >> 26; h2 &= 0x3ffffff; + h4 += h3 >> 26; h3 &= 0x3ffffff; + + long c = ((int)(h4 >> 26) - 1) * 5; + c += (long)k0 + ((h0 ) | (h1 << 26)); + Pack.UInt32_To_LE((uint)c, output, outOff ); c >>= 32; + c += (long)k1 + ((h1 >> 6) | (h2 << 20)); + Pack.UInt32_To_LE((uint)c, output, outOff + 4); c >>= 32; + c += (long)k2 + ((h2 >> 12) | (h3 << 14)); + Pack.UInt32_To_LE((uint)c, output, outOff + 8); c >>= 32; + c += (long)k3 + ((h3 >> 18) | (h4 << 8)); + Pack.UInt32_To_LE((uint)c, output, outOff + 12); Reset(); return BlockSize; @@ -284,10 +297,5 @@ namespace Org.BouncyCastle.Crypto.Macs h0 = h1 = h2 = h3 = h4 = 0; } - - private static ulong mul32x32_64(uint i1, uint i2) - { - return ((ulong)i1) * i2; - } } } diff --git a/crypto/src/crypto/modes/ChaCha20Poly1305.cs b/crypto/src/crypto/modes/ChaCha20Poly1305.cs index 6ca32d9c6..462013200 100644 --- a/crypto/src/crypto/modes/ChaCha20Poly1305.cs +++ b/crypto/src/crypto/modes/ChaCha20Poly1305.cs @@ -221,7 +221,7 @@ namespace Org.BouncyCastle.Crypto.Modes if (++mBufPos == mBuf.Length) { mPoly1305.BlockUpdate(mBuf, 0, BufSize); - ProcessData(mBuf, 0, BufSize, outBytes, outOff); + ProcessBlock(mBuf, 0, outBytes, outOff); Array.Copy(mBuf, BufSize, mBuf, 0, MacSize); this.mBufPos = MacSize; return BufSize; @@ -234,7 +234,7 @@ namespace Org.BouncyCastle.Crypto.Modes mBuf[mBufPos] = input; if (++mBufPos == BufSize) { - ProcessData(mBuf, 0, BufSize, outBytes, outOff); + ProcessBlock(mBuf, 0, outBytes, outOff); mPoly1305.BlockUpdate(outBytes, outOff, BufSize); this.mBufPos = 0; return BufSize; @@ -275,53 +275,99 @@ namespace Org.BouncyCastle.Crypto.Modes { case State.DecData: { - for (int i = 0; i < len; ++i) + int available = mBuf.Length - mBufPos; + if (len < available) { - mBuf[mBufPos] = inBytes[inOff + i]; - if (++mBufPos == mBuf.Length) + Array.Copy(inBytes, inOff, mBuf, mBufPos, len); + mBufPos += len; + break; + } + + if (mBufPos >= BufSize) + { + mPoly1305.BlockUpdate(mBuf, 0, BufSize); + ProcessBlock(mBuf, 0, outBytes, outOff); + Array.Copy(mBuf, BufSize, mBuf, 0, mBufPos -= BufSize); + resultLen = BufSize; + + available += BufSize; + if (len < available) { - mPoly1305.BlockUpdate(mBuf, 0, BufSize); - ProcessData(mBuf, 0, BufSize, outBytes, outOff + resultLen); - Array.Copy(mBuf, BufSize, mBuf, 0, MacSize); - this.mBufPos = MacSize; - resultLen += BufSize; + Array.Copy(inBytes, inOff, mBuf, mBufPos, len); + mBufPos += len; + break; } } + + int inLimit1 = inOff + len - mBuf.Length; + int inLimit2 = inLimit1 - BufSize; + + available = BufSize - mBufPos; + Array.Copy(inBytes, inOff, mBuf, mBufPos, available); + mPoly1305.BlockUpdate(mBuf, 0, BufSize); + ProcessBlock(mBuf, 0, outBytes, outOff + resultLen); + inOff += available; + resultLen += BufSize; + + while (inOff <= inLimit2) + { + mPoly1305.BlockUpdate(inBytes, inOff, BufSize * 2); + ProcessBlocks2(inBytes, inOff, outBytes, outOff + resultLen); + inOff += BufSize * 2; + resultLen += BufSize * 2; + } + + if (inOff <= inLimit1) + { + mPoly1305.BlockUpdate(inBytes, inOff, BufSize); + ProcessBlock(inBytes, inOff, outBytes, outOff + resultLen); + inOff += BufSize; + resultLen += BufSize; + } + + mBufPos = mBuf.Length + inLimit1 - inOff; + Array.Copy(inBytes, inOff, mBuf, 0, mBufPos); break; } case State.EncData: { - if (mBufPos != 0) + int available = BufSize - mBufPos; + if (len < available) { - while (len > 0) - { - --len; - mBuf[mBufPos] = inBytes[inOff++]; - if (++mBufPos == BufSize) - { - ProcessData(mBuf, 0, BufSize, outBytes, outOff); - mPoly1305.BlockUpdate(outBytes, outOff, BufSize); - this.mBufPos = 0; - resultLen = BufSize; - break; - } - } + Array.Copy(inBytes, inOff, mBuf, mBufPos, len); + mBufPos += len; + break; } - while (len >= BufSize) + int inLimit1 = inOff + len - BufSize; + int inLimit2 = inLimit1 - BufSize; + + if (mBufPos > 0) { - ProcessData(inBytes, inOff, BufSize, outBytes, outOff + resultLen); - mPoly1305.BlockUpdate(outBytes, outOff + resultLen, BufSize); - inOff += BufSize; - len -= BufSize; - resultLen += BufSize; + Array.Copy(inBytes, inOff, mBuf, mBufPos, available); + ProcessBlock(mBuf, 0, outBytes, outOff); + inOff += available; + resultLen = BufSize; + } + + while (inOff <= inLimit2) + { + ProcessBlocks2(inBytes, inOff, outBytes, outOff + resultLen); + inOff += BufSize * 2; + resultLen += BufSize * 2; } - if (len > 0) + if (inOff <= inLimit1) { - Array.Copy(inBytes, inOff, mBuf, 0, len); - this.mBufPos = len; + ProcessBlock(inBytes, inOff, outBytes, outOff + resultLen); + inOff += BufSize; + resultLen += BufSize; } + + mPoly1305.BlockUpdate(outBytes, outOff, resultLen); + + mBufPos = BufSize + inLimit1 - inOff; + Array.Copy(inBytes, inOff, mBuf, 0, mBufPos); break; } default: @@ -500,6 +546,24 @@ namespace Org.BouncyCastle.Crypto.Modes } } + private void ProcessBlock(byte[] inBytes, int inOff, byte[] outBytes, int outOff) + { + Check.OutputLength(outBytes, outOff, 64, "output buffer too short"); + + mChacha20.ProcessBlock(inBytes, inOff, outBytes, outOff); + + this.mDataCount = IncrementCount(mDataCount, 64U, DataLimit); + } + + private void ProcessBlocks2(byte[] inBytes, int inOff, byte[] outBytes, int outOff) + { + Check.OutputLength(outBytes, outOff, 128, "output buffer too short"); + + mChacha20.ProcessBlocks2(inBytes, inOff, outBytes, outOff); + + this.mDataCount = IncrementCount(mDataCount, 128U, DataLimit); + } + private void ProcessData(byte[] inBytes, int inOff, int inLen, byte[] outBytes, int outOff) { Check.OutputLength(outBytes, outOff, inLen, "output buffer too short"); diff --git a/crypto/src/tls/crypto/impl/bc/BcChaCha20Poly1305.cs b/crypto/src/tls/crypto/impl/bc/BcChaCha20Poly1305.cs index 8d801ed7a..ab78d0ce2 100644 --- a/crypto/src/tls/crypto/impl/bc/BcChaCha20Poly1305.cs +++ b/crypto/src/tls/crypto/impl/bc/BcChaCha20Poly1305.cs @@ -30,7 +30,7 @@ namespace Org.BouncyCastle.Tls.Crypto.Impl.BC { int ciphertextLength = inputLength; - m_cipher.ProcessBytes(input, inputOffset, inputLength, output, outputOffset); + m_cipher.DoFinal(input, inputOffset, inputLength, output, outputOffset); int outputLength = inputLength; if (ciphertextLength != outputLength) @@ -63,7 +63,7 @@ namespace Org.BouncyCastle.Tls.Crypto.Impl.BC if (badMac) throw new TlsFatalAlert(AlertDescription.bad_record_mac); - m_cipher.ProcessBytes(input, inputOffset, ciphertextLength, output, outputOffset); + m_cipher.DoFinal(input, inputOffset, ciphertextLength, output, outputOffset); int outputLength = ciphertextLength; if (ciphertextLength != outputLength) |