diff options
author | Peter Dettman <peter.dettman@bouncycastle.org> | 2023-02-16 22:38:56 +0700 |
---|---|---|
committer | Peter Dettman <peter.dettman@bouncycastle.org> | 2023-02-16 22:38:56 +0700 |
commit | 2dbbba4f577c3cd687f67d6ede1d62ea40a7fe1d (patch) | |
tree | e782a95d11c17291592d101a87a2c621f75e3c30 | |
parent | CMS support for key ID + public key recipients in key agreement (diff) | |
download | BouncyCastle.NET-ed25519-2dbbba4f577c3cd687f67d6ede1d62ea40a7fe1d.tar.xz |
GCM perf. opts.
-rw-r--r-- | crypto/src/crypto/modes/GCMBlockCipher.cs | 249 | ||||
-rw-r--r-- | crypto/src/crypto/modes/gcm/GcmUtilities.cs | 102 |
2 files changed, 344 insertions, 7 deletions
diff --git a/crypto/src/crypto/modes/GCMBlockCipher.cs b/crypto/src/crypto/modes/GCMBlockCipher.cs index e7eb9d916..7ed17030b 100644 --- a/crypto/src/crypto/modes/GCMBlockCipher.cs +++ b/crypto/src/crypto/modes/GCMBlockCipher.cs @@ -8,9 +8,13 @@ using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; #endif +#if NETCOREAPP3_0_OR_GREATER +using Org.BouncyCastle.Crypto.Engines; +#endif using Org.BouncyCastle.Crypto.Modes.Gcm; using Org.BouncyCastle.Crypto.Parameters; using Org.BouncyCastle.Crypto.Utilities; +using Org.BouncyCastle.Math.EC; using Org.BouncyCastle.Utilities; namespace Org.BouncyCastle.Crypto.Modes @@ -22,6 +26,14 @@ namespace Org.BouncyCastle.Crypto.Modes public sealed class GcmBlockCipher : IAeadBlockCipher { +#if NETCOREAPP3_0_OR_GREATER + private static readonly Vector128<byte> ReverseBytesMask = + Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + + private static bool IsFourWaySupported => + Pclmulqdq.IsSupported && Ssse3.IsSupported && Unsafe.SizeOf<Vector128<byte>>() == BlockSize; +#endif + internal static IGcmMultiplier CreateGcmMultiplier() { #if NETCOREAPP3_0_OR_GREATER @@ -49,6 +61,9 @@ namespace Org.BouncyCastle.Crypto.Modes private byte[] nonce; private byte[] initialAssociatedText; private byte[] H; +#if NETCOREAPP3_0_OR_GREATER + private Vector128<ulong>[] HPow = null; +#endif private byte[] J0; // These fields are modified during processing @@ -172,6 +187,18 @@ namespace Org.BouncyCastle.Crypto.Modes // if keyParam is null we're reusing the last key and the multiplier doesn't need re-init multiplier.Init(H); exp = null; + +#if NETCOREAPP3_0_OR_GREATER + if (IsFourWaySupported) + { + var H1 = GcmUtilities.Load(H); + var H2 = GcmUtilities.Square(H1); + var H3 = GcmUtilities.Multiply(H1, H2); + var H4 = GcmUtilities.Square(H2); + + HPow = new Vector128<ulong>[4]{ H4, H3, H2, H1 }; + } +#endif } else if (this.H == null) { @@ -538,7 +565,14 @@ namespace Org.BouncyCastle.Crypto.Modes //bufOff = 0; } - while (input.Length >= BlockSize * 2) + while (input.Length >= BlockSize * 4) + { + EncryptBlocks4(input, output[resultLen..]); + input = input[(BlockSize * 4)..]; + resultLen += BlockSize * 4; + } + + if (input.Length >= BlockSize * 2) { EncryptBlocks2(input, output[resultLen..]); input = input[(BlockSize * 2)..]; @@ -583,6 +617,7 @@ namespace Org.BouncyCastle.Crypto.Modes int inLimit1 = bufBlock.Length; int inLimit2 = inLimit1 + BlockSize; + int inLimit4 = inLimit1 + BlockSize * 3; available = BlockSize - bufOff; input[..available].CopyTo(bufBlock.AsSpan(bufOff)); @@ -591,7 +626,14 @@ namespace Org.BouncyCastle.Crypto.Modes resultLen += BlockSize; //bufOff = 0; - while (input.Length >= inLimit2) + while (input.Length >= inLimit4) + { + DecryptBlocks4(input, output[resultLen..]); + input = input[(BlockSize * 4)..]; + resultLen += BlockSize * 4; + } + + if (input.Length >= inLimit2) { DecryptBlocks2(input, output[resultLen..]); input = input[(BlockSize * 2)..]; @@ -1027,6 +1069,80 @@ namespace Org.BouncyCastle.Crypto.Modes totalLength += BlockSize * 2; } + private void DecryptBlocks4(ReadOnlySpan<byte> input, Span<byte> output) + { +#if NETCOREAPP3_0_OR_GREATER + if (IsFourWaySupported) + { + Check.OutputLength(output, BlockSize * 4, "output buffer too short"); + + if (totalLength == 0) + { + InitCipher(); + } + + Span<byte> ctrBlocks = stackalloc byte[BlockSize * 4]; + GetNextCtrBlocks4(ctrBlocks); + + var c0 = MemoryMarshal.Read<Vector128<byte>>(input); + var c1 = MemoryMarshal.Read<Vector128<byte>>(input[BlockSize..]); + var c2 = MemoryMarshal.Read<Vector128<byte>>(input[(BlockSize * 2)..]); + var c3 = MemoryMarshal.Read<Vector128<byte>>(input[(BlockSize * 3)..]); + + var n0 = MemoryMarshal.Read<Vector128<byte>>(ctrBlocks); + var n1 = MemoryMarshal.Read<Vector128<byte>>(ctrBlocks[BlockSize..]); + var n2 = MemoryMarshal.Read<Vector128<byte>>(ctrBlocks[(BlockSize * 2)..]); + var n3 = MemoryMarshal.Read<Vector128<byte>>(ctrBlocks[(BlockSize * 3)..]); + + var p0 = Sse2.Xor(c0, n0); + var p1 = Sse2.Xor(c1, n1); + var p2 = Sse2.Xor(c2, n2); + var p3 = Sse2.Xor(c3, n3); + + MemoryMarshal.Write(output, ref p0); + MemoryMarshal.Write(output[BlockSize..], ref p1); + MemoryMarshal.Write(output[(BlockSize * 2)..], ref p2); + MemoryMarshal.Write(output[(BlockSize * 3)..], ref p3); + + var t0 = MemoryMarshal.Read<Vector128<byte>>(S.AsSpan()); + c0 = Sse2.Xor(c0, t0); + + c0 = Ssse3.Shuffle(c0, ReverseBytesMask); + c1 = Ssse3.Shuffle(c1, ReverseBytesMask); + c2 = Ssse3.Shuffle(c2, ReverseBytesMask); + c3 = Ssse3.Shuffle(c3, ReverseBytesMask); + + GcmUtilities.MultiplyExt(c0.AsUInt64(), HPow[0], out var U0, out var U1, out var U2); + GcmUtilities.MultiplyExt(c1.AsUInt64(), HPow[1], out var V0, out var V1, out var V2); + GcmUtilities.MultiplyExt(c2.AsUInt64(), HPow[2], out var W0, out var W1, out var W2); + GcmUtilities.MultiplyExt(c3.AsUInt64(), HPow[3], out var X0, out var X1, out var X2); + + U0 = Sse2.Xor(U0, V0); + U1 = Sse2.Xor(U1, V1); + U2 = Sse2.Xor(U2, V2); + + U0 = Sse2.Xor(U0, W0); + U1 = Sse2.Xor(U1, W1); + U2 = Sse2.Xor(U2, W2); + + U0 = Sse2.Xor(U0, X0); + U1 = Sse2.Xor(U1, X1); + U2 = Sse2.Xor(U2, X2); + + var t2 = GcmUtilities.Reduce3(U0, U1, U2).AsByte(); + + t2 = Ssse3.Shuffle(t2, ReverseBytesMask); + MemoryMarshal.Write(S.AsSpan(), ref t2); + + totalLength += BlockSize * 4; + return; + } +#endif + + DecryptBlocks2(input, output); + DecryptBlocks2(input[(BlockSize * 2)..], output[(BlockSize * 2)..]); + } + private void EncryptBlock(ReadOnlySpan<byte> input, Span<byte> output) { Check.OutputLength(output, BlockSize, "output buffer too short"); @@ -1080,7 +1196,7 @@ namespace Org.BouncyCastle.Crypto.Modes private void EncryptBlocks2(ReadOnlySpan<byte> input, Span<byte> output) { - Check.OutputLength(output, BlockSize * 2, "Output buffer too short"); + Check.OutputLength(output, BlockSize * 2, "output buffer too short"); if (totalLength == 0) { @@ -1169,6 +1285,80 @@ namespace Org.BouncyCastle.Crypto.Modes totalLength += BlockSize * 2; } + private void EncryptBlocks4(ReadOnlySpan<byte> input, Span<byte> output) + { +#if NETCOREAPP3_0_OR_GREATER + if (IsFourWaySupported) + { + Check.OutputLength(output, BlockSize * 4, "output buffer too short"); + + if (totalLength == 0) + { + InitCipher(); + } + + Span<byte> ctrBlocks = stackalloc byte[BlockSize * 4]; + GetNextCtrBlocks4(ctrBlocks); + + var p0 = MemoryMarshal.Read<Vector128<byte>>(input); + var p1 = MemoryMarshal.Read<Vector128<byte>>(input[BlockSize..]); + var p2 = MemoryMarshal.Read<Vector128<byte>>(input[(BlockSize * 2)..]); + var p3 = MemoryMarshal.Read<Vector128<byte>>(input[(BlockSize * 3)..]); + + var n0 = MemoryMarshal.Read<Vector128<byte>>(ctrBlocks); + var n1 = MemoryMarshal.Read<Vector128<byte>>(ctrBlocks[BlockSize..]); + var n2 = MemoryMarshal.Read<Vector128<byte>>(ctrBlocks[(BlockSize * 2)..]); + var n3 = MemoryMarshal.Read<Vector128<byte>>(ctrBlocks[(BlockSize * 3)..]); + + var c0 = Sse2.Xor(p0, n0); + var c1 = Sse2.Xor(p1, n1); + var c2 = Sse2.Xor(p2, n2); + var c3 = Sse2.Xor(p3, n3); + + MemoryMarshal.Write(output, ref c0); + MemoryMarshal.Write(output[BlockSize..], ref c1); + MemoryMarshal.Write(output[(BlockSize * 2)..], ref c2); + MemoryMarshal.Write(output[(BlockSize * 3)..], ref c3); + + var t0 = MemoryMarshal.Read<Vector128<byte>>(S.AsSpan()); + c0 = Sse2.Xor(c0, t0); + + c0 = Ssse3.Shuffle(c0, ReverseBytesMask); + c1 = Ssse3.Shuffle(c1, ReverseBytesMask); + c2 = Ssse3.Shuffle(c2, ReverseBytesMask); + c3 = Ssse3.Shuffle(c3, ReverseBytesMask); + + GcmUtilities.MultiplyExt(c0.AsUInt64(), HPow[0], out var U0, out var U1, out var U2); + GcmUtilities.MultiplyExt(c1.AsUInt64(), HPow[1], out var V0, out var V1, out var V2); + GcmUtilities.MultiplyExt(c2.AsUInt64(), HPow[2], out var W0, out var W1, out var W2); + GcmUtilities.MultiplyExt(c3.AsUInt64(), HPow[3], out var X0, out var X1, out var X2); + + U0 = Sse2.Xor(U0, V0); + U1 = Sse2.Xor(U1, V1); + U2 = Sse2.Xor(U2, V2); + + U0 = Sse2.Xor(U0, W0); + U1 = Sse2.Xor(U1, W1); + U2 = Sse2.Xor(U2, W2); + + U0 = Sse2.Xor(U0, X0); + U1 = Sse2.Xor(U1, X1); + U2 = Sse2.Xor(U2, X2); + + var t2 = GcmUtilities.Reduce3(U0, U1, U2).AsByte(); + + t2 = Ssse3.Shuffle(t2, ReverseBytesMask); + MemoryMarshal.Write(S.AsSpan(), ref t2); + + totalLength += BlockSize * 4; + return; + } +#endif + + EncryptBlocks2(input, output); + EncryptBlocks2(input[(BlockSize * 2)..], output[(BlockSize * 2)..]); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private void GetNextCtrBlock(Span<byte> block) { @@ -1197,6 +1387,51 @@ namespace Org.BouncyCastle.Crypto.Modes cipher.ProcessBlock(counter, blocks[BlockSize..]); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void GetNextCtrBlocks4(Span<byte> blocks) + { + if (blocksRemaining < 4) + throw new InvalidOperationException("Attempt to process too many blocks"); + + blocksRemaining -= 4; + +#if NETCOREAPP3_0_OR_GREATER + if (AesEngine_X86.IsSupported && cipher is AesEngine_X86 x86) + { + uint counter0 = counter32; + uint counter1 = counter0 + 1U; + uint counter2 = counter0 + 2U; + uint counter3 = counter0 + 3U; + uint counter4 = counter0 + 4U; + counter32 = counter4; + + counter.CopyTo(blocks); + counter.CopyTo(blocks[BlockSize..]); + counter.CopyTo(blocks[(BlockSize * 2)..]); + Pack.UInt32_To_BE(counter4, counter, 12); + Pack.UInt32_To_BE(counter1, blocks[12..]); + Pack.UInt32_To_BE(counter2, blocks[28..]); + Pack.UInt32_To_BE(counter3, blocks[44..]); + counter.CopyTo(blocks[(BlockSize * 3)..]); + + x86.ProcessFourBlocks(blocks, blocks); + return; + } +#endif + + Pack.UInt32_To_BE(++counter32, counter, 12); + cipher.ProcessBlock(counter, blocks); + + Pack.UInt32_To_BE(++counter32, counter, 12); + cipher.ProcessBlock(counter, blocks[BlockSize..]); + + Pack.UInt32_To_BE(++counter32, counter, 12); + cipher.ProcessBlock(counter, blocks[(BlockSize * 2)..]); + + Pack.UInt32_To_BE(++counter32, counter, 12); + cipher.ProcessBlock(counter, blocks[(BlockSize * 3)..]); + } + private void ProcessPartial(Span<byte> partialBlock, Span<byte> output) { Span<byte> ctrBlock = stackalloc byte[BlockSize]; @@ -1219,7 +1454,7 @@ namespace Org.BouncyCastle.Crypto.Modes #else private void DecryptBlock(byte[] inBuf, int inOff, byte[] outBuf, int outOff) { - Check.OutputLength(outBuf, outOff, BlockSize, "Output buffer too short"); + Check.OutputLength(outBuf, outOff, BlockSize, "output buffer too short"); if (totalLength == 0) { @@ -1255,7 +1490,7 @@ namespace Org.BouncyCastle.Crypto.Modes private void DecryptBlocks2(byte[] inBuf, int inOff, byte[] outBuf, int outOff) { - Check.OutputLength(outBuf, outOff, BlockSize * 2, "Output buffer too short"); + Check.OutputLength(outBuf, outOff, BlockSize * 2, "output buffer too short"); if (totalLength == 0) { @@ -1316,7 +1551,7 @@ namespace Org.BouncyCastle.Crypto.Modes private void EncryptBlock(byte[] inBuf, int inOff, byte[] outBuf, int outOff) { - Check.OutputLength(outBuf, outOff, BlockSize, "Output buffer too short"); + Check.OutputLength(outBuf, outOff, BlockSize, "output buffer too short"); if (totalLength == 0) { @@ -1352,7 +1587,7 @@ namespace Org.BouncyCastle.Crypto.Modes private void EncryptBlocks2(byte[] inBuf, int inOff, byte[] outBuf, int outOff) { - Check.OutputLength(outBuf, outOff, BlockSize * 2, "Output buffer too short"); + Check.OutputLength(outBuf, outOff, BlockSize * 2, "output buffer too short"); if (totalLength == 0) { diff --git a/crypto/src/crypto/modes/gcm/GcmUtilities.cs b/crypto/src/crypto/modes/gcm/GcmUtilities.cs index a239e9ec0..b2c74d7d0 100644 --- a/crypto/src/crypto/modes/gcm/GcmUtilities.cs +++ b/crypto/src/crypto/modes/gcm/GcmUtilities.cs @@ -140,6 +140,108 @@ namespace Org.BouncyCastle.Crypto.Modes.Gcm x.n1 = z1; } +#if NETCOREAPP3_0_OR_GREATER + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector128<ulong> Load(byte[] x) + { + AsFieldElement(x, out FieldElement X); + return Vector128.Create(X.n1, X.n0); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector128<ulong> Multiply(Vector128<ulong> X, Vector128<ulong> Y) + { + MultiplyExt(X, Y, out var Z0, out var Z1, out var Z2); + return Reduce3(Z0, Z1, Z2); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void MultiplyExt(Vector128<ulong> X, Vector128<ulong> Y, out Vector128<ulong> Z0, + out Vector128<ulong> Z1, out Vector128<ulong> Z2) + { + if (!Pclmulqdq.IsSupported) + throw new PlatformNotSupportedException(nameof(GcmUtilities.MultiplyExt)); + + Z0 = Pclmulqdq.CarrylessMultiply(X, Y, 0x00); + Z1 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X, Y, 0x01), + Pclmulqdq.CarrylessMultiply(X, Y, 0x10)); + Z2 = Pclmulqdq.CarrylessMultiply(X, Y, 0x11); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector128<ulong> Reduce2(Vector128<ulong> Z0, Vector128<ulong> Z2) + { + ulong t3 = Z0.GetElement(0); + ulong t2 = Z0.GetElement(1); + ulong t1 = Z2.GetElement(0); + ulong t0 = Z2.GetElement(1); + + ulong z0 = (t0 << 1) | (t1 >> 63); + ulong z1 = (t1 << 1) | (t2 >> 63); + ulong z2 = (t2 << 1) | (t3 >> 63); + ulong z3 = (t3 << 1); + + Debug.Assert(z3 << 63 == 0); + + z1 ^= z3 ^ (z3 >> 1) ^ (z3 >> 2) ^ (z3 >> 7); +// z2 ^= (z3 << 63) ^ (z3 << 62) ^ (z3 << 57); + z2 ^= (z3 << 62) ^ (z3 << 57); + + z0 ^= z2 ^ (z2 >> 1) ^ (z2 >> 2) ^ (z2 >> 7); + z1 ^= (z2 << 63) ^ (z2 << 62) ^ (z2 << 57); + + return Vector128.Create(z1, z0); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector128<ulong> Reduce3(Vector128<ulong> Z0, Vector128<ulong> Z1, Vector128<ulong> Z2) + { + ulong t3 = Z0.GetElement(0); + ulong t2 = Z0.GetElement(1) ^ Z1.GetElement(0); + ulong t1 = Z2.GetElement(0) ^ Z1.GetElement(1); + ulong t0 = Z2.GetElement(1); + + ulong z0 = (t0 << 1) | (t1 >> 63); + ulong z1 = (t1 << 1) | (t2 >> 63); + ulong z2 = (t2 << 1) | (t3 >> 63); + ulong z3 = (t3 << 1); + + Debug.Assert(z3 << 63 == 0); + + z1 ^= z3 ^ (z3 >> 1) ^ (z3 >> 2) ^ (z3 >> 7); +// z2 ^= (z3 << 63) ^ (z3 << 62) ^ (z3 << 57); + z2 ^= (z3 << 62) ^ (z3 << 57); + + z0 ^= z2 ^ (z2 >> 1) ^ (z2 >> 2) ^ (z2 >> 7); + z1 ^= (z2 << 63) ^ (z2 << 62) ^ (z2 << 57); + + return Vector128.Create(z1, z0); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void Store(Vector128<ulong> X, byte[] z) + { + AsBytes(X.GetElement(1), X.GetElement(0), z); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector128<ulong> Square(Vector128<ulong> X) + { + SquareExt(X, out var Z0, out var Z2); + return Reduce2(Z0, Z2); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void SquareExt(Vector128<ulong> X, out Vector128<ulong> Z0, out Vector128<ulong> Z2) + { + if (!Pclmulqdq.IsSupported) + throw new PlatformNotSupportedException(nameof(GcmUtilities.SquareExt)); + + Z0 = Pclmulqdq.CarrylessMultiply(X, X, 0x00); + Z2 = Pclmulqdq.CarrylessMultiply(X, X, 0x11); + } +#endif + internal static void MultiplyP7(ref FieldElement x) { ulong x0 = x.n0, x1 = x.n1; |