summary refs log tree commit diff
diff options
context:
space:
mode:
author Peter Dettman <peter.dettman@bouncycastle.org> 2023-02-16 22:38:56 +0700
committer Peter Dettman <peter.dettman@bouncycastle.org> 2023-02-16 22:38:56 +0700
commit 2dbbba4f577c3cd687f67d6ede1d62ea40a7fe1d (patch)
tree e782a95d11c17291592d101a87a2c621f75e3c30
parent CMS support for key ID + public key recipients in key agreement (diff)
download BouncyCastle.NET-ed25519-2dbbba4f577c3cd687f67d6ede1d62ea40a7fe1d.tar.xz
GCM perf. opts.
-rw-r--r-- crypto/src/crypto/modes/GCMBlockCipher.cs 249
-rw-r--r-- crypto/src/crypto/modes/gcm/GcmUtilities.cs 102
2 files changed, 344 insertions, 7 deletions
diff --git a/crypto/src/crypto/modes/GCMBlockCipher.cs b/crypto/src/crypto/modes/GCMBlockCipher.cs
index e7eb9d916..7ed17030b 100644
--- a/crypto/src/crypto/modes/GCMBlockCipher.cs
+++ b/crypto/src/crypto/modes/GCMBlockCipher.cs
@@ -8,9 +8,13 @@ using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 #endif
 
+#if NETCOREAPP3_0_OR_GREATER
+using Org.BouncyCastle.Crypto.Engines;
+#endif
 using Org.BouncyCastle.Crypto.Modes.Gcm;
 using Org.BouncyCastle.Crypto.Parameters;
 using Org.BouncyCastle.Crypto.Utilities;
+using Org.BouncyCastle.Math.EC;
 using Org.BouncyCastle.Utilities;
 
 namespace Org.BouncyCastle.Crypto.Modes
@@ -22,6 +26,14 @@ namespace Org.BouncyCastle.Crypto.Modes
     public sealed class GcmBlockCipher
         : IAeadBlockCipher
     {
+#if NETCOREAPP3_0_OR_GREATER
+        private static readonly Vector128<byte> ReverseBytesMask = // control vector passed to Ssse3.Shuffle in the four-block paths
+            Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); // indices 15..0: output byte i is taken from input byte 15 - i
+
+        private static bool IsFourWaySupported => // gate for the vectorised four-block code paths
+            Pclmulqdq.IsSupported && Ssse3.IsSupported && Unsafe.SizeOf<Vector128<byte>>() == BlockSize; // needs carry-less multiply, byte shuffle, and a vector exactly one block wide
+#endif
+
         internal static IGcmMultiplier CreateGcmMultiplier()
         {
 #if NETCOREAPP3_0_OR_GREATER
@@ -49,6 +61,9 @@ namespace Org.BouncyCastle.Crypto.Modes
         private byte[]      nonce;
         private byte[]      initialAssociatedText;
         private byte[]      H;
+#if NETCOREAPP3_0_OR_GREATER
+        private Vector128<ulong>[] HPow = null; // filled as { H4, H3, H2, H1 } (built from H via GcmUtilities.Square/Multiply) where the multiplier is initialised, if IsFourWaySupported
+#endif
         private byte[]      J0;
 
         // These fields are modified during processing
@@ -172,6 +187,18 @@ namespace Org.BouncyCastle.Crypto.Modes
                 // if keyParam is null we're reusing the last key and the multiplier doesn't need re-init
                 multiplier.Init(H);
                 exp = null;
+
+#if NETCOREAPP3_0_OR_GREATER
+                if (IsFourWaySupported)
+                {
+                    var H1 = GcmUtilities.Load(H);
+                    var H2 = GcmUtilities.Square(H1);
+                    var H3 = GcmUtilities.Multiply(H1, H2);
+                    var H4 = GcmUtilities.Square(H2);
+
+                    HPow = new Vector128<ulong>[4]{ H4, H3, H2, H1 };
+                }
+#endif
             }
             else if (this.H == null)
             {
@@ -538,7 +565,14 @@ namespace Org.BouncyCastle.Crypto.Modes
                     //bufOff = 0;
                 }
 
-                while (input.Length >= BlockSize * 2)
+                while (input.Length >= BlockSize * 4)
+                {
+                    EncryptBlocks4(input, output[resultLen..]);
+                    input = input[(BlockSize * 4)..];
+                    resultLen += BlockSize * 4;
+                }
+
+                if (input.Length >= BlockSize * 2)
                 {
                     EncryptBlocks2(input, output[resultLen..]);
                     input = input[(BlockSize * 2)..];
@@ -583,6 +617,7 @@ namespace Org.BouncyCastle.Crypto.Modes
 
                 int inLimit1 = bufBlock.Length;
                 int inLimit2 = inLimit1 + BlockSize;
+                int inLimit4 = inLimit1 + BlockSize * 3;
 
                 available = BlockSize - bufOff;
                 input[..available].CopyTo(bufBlock.AsSpan(bufOff));
@@ -591,7 +626,14 @@ namespace Org.BouncyCastle.Crypto.Modes
                 resultLen += BlockSize;
                 //bufOff = 0;
 
-                while (input.Length >= inLimit2)
+                while (input.Length >= inLimit4)
+                {
+                    DecryptBlocks4(input, output[resultLen..]);
+                    input = input[(BlockSize * 4)..];
+                    resultLen += BlockSize * 4;
+                }
+
+                if (input.Length >= inLimit2)
                 {
                     DecryptBlocks2(input, output[resultLen..]);
                     input = input[(BlockSize * 2)..];
@@ -1027,6 +1069,80 @@ namespace Org.BouncyCastle.Crypto.Modes
             totalLength += BlockSize * 2;
         }
 
+        private void DecryptBlocks4(ReadOnlySpan<byte> input, Span<byte> output) // Decrypts four consecutive blocks from input into output and folds the four input blocks into S.
+        {
+#if NETCOREAPP3_0_OR_GREATER
+            if (IsFourWaySupported) // vectorised path; otherwise execution falls through to the two DecryptBlocks2 calls at the end
+            {
+                Check.OutputLength(output, BlockSize * 4, "output buffer too short");
+
+                if (totalLength == 0) // totalLength is still zero (presumably nothing processed yet): call InitCipher() first
+                {
+                    InitCipher();
+                }
+
+                Span<byte> ctrBlocks = stackalloc byte[BlockSize * 4]; // stack scratch for four cipher-processed counter blocks
+                GetNextCtrBlocks4(ctrBlocks);
+
+                var c0 = MemoryMarshal.Read<Vector128<byte>>(input); // c0..c3: the four input (ciphertext) blocks
+                var c1 = MemoryMarshal.Read<Vector128<byte>>(input[BlockSize..]);
+                var c2 = MemoryMarshal.Read<Vector128<byte>>(input[(BlockSize * 2)..]);
+                var c3 = MemoryMarshal.Read<Vector128<byte>>(input[(BlockSize * 3)..]);
+
+                var n0 = MemoryMarshal.Read<Vector128<byte>>(ctrBlocks); // n0..n3: the four processed counter blocks
+                var n1 = MemoryMarshal.Read<Vector128<byte>>(ctrBlocks[BlockSize..]);
+                var n2 = MemoryMarshal.Read<Vector128<byte>>(ctrBlocks[(BlockSize * 2)..]);
+                var n3 = MemoryMarshal.Read<Vector128<byte>>(ctrBlocks[(BlockSize * 3)..]);
+
+                var p0 = Sse2.Xor(c0, n0); // p0..p3: output blocks = input XOR processed counter
+                var p1 = Sse2.Xor(c1, n1);
+                var p2 = Sse2.Xor(c2, n2);
+                var p3 = Sse2.Xor(c3, n3);
+
+                MemoryMarshal.Write(output, ref p0);
+                MemoryMarshal.Write(output[BlockSize..], ref p1);
+                MemoryMarshal.Write(output[(BlockSize * 2)..], ref p2);
+                MemoryMarshal.Write(output[(BlockSize * 3)..], ref p3);
+
+                var t0 = MemoryMarshal.Read<Vector128<byte>>(S.AsSpan()); // current contents of S
+                c0 = Sse2.Xor(c0, t0); // S is XORed into the first input block only
+
+                c0 = Ssse3.Shuffle(c0, ReverseBytesMask); // reverse the byte order of each block before multiplying
+                c1 = Ssse3.Shuffle(c1, ReverseBytesMask);
+                c2 = Ssse3.Shuffle(c2, ReverseBytesMask);
+                c3 = Ssse3.Shuffle(c3, ReverseBytesMask);
+
+                GcmUtilities.MultiplyExt(c0.AsUInt64(), HPow[0], out var U0, out var U1, out var U2); // unreduced product with HPow[0] (H4)
+                GcmUtilities.MultiplyExt(c1.AsUInt64(), HPow[1], out var V0, out var V1, out var V2); // ... with HPow[1] (H3)
+                GcmUtilities.MultiplyExt(c2.AsUInt64(), HPow[2], out var W0, out var W1, out var W2); // ... with HPow[2] (H2)
+                GcmUtilities.MultiplyExt(c3.AsUInt64(), HPow[3], out var X0, out var X1, out var X2); // ... with HPow[3] (H1)
+
+                U0 = Sse2.Xor(U0, V0); // XOR the four unreduced products together, part by part
+                U1 = Sse2.Xor(U1, V1);
+                U2 = Sse2.Xor(U2, V2);
+
+                U0 = Sse2.Xor(U0, W0);
+                U1 = Sse2.Xor(U1, W1);
+                U2 = Sse2.Xor(U2, W2);
+
+                U0 = Sse2.Xor(U0, X0);
+                U1 = Sse2.Xor(U1, X1);
+                U2 = Sse2.Xor(U2, X2);
+
+                var t2 = GcmUtilities.Reduce3(U0, U1, U2).AsByte(); // a single reduction for all four blocks
+
+                t2 = Ssse3.Shuffle(t2, ReverseBytesMask); // undo the byte reversal before storing
+                MemoryMarshal.Write(S.AsSpan(), ref t2); // S now holds the updated value
+
+                totalLength += BlockSize * 4;
+                return;
+            }
+#endif
+
+            DecryptBlocks2(input, output); // fallback: two two-block calls covering the same four blocks
+            DecryptBlocks2(input[(BlockSize * 2)..], output[(BlockSize * 2)..]);
+        }
+
         private void EncryptBlock(ReadOnlySpan<byte> input, Span<byte> output)
         {
             Check.OutputLength(output, BlockSize, "output buffer too short");
@@ -1080,7 +1196,7 @@ namespace Org.BouncyCastle.Crypto.Modes
 
         private void EncryptBlocks2(ReadOnlySpan<byte> input, Span<byte> output)
         {
-            Check.OutputLength(output, BlockSize * 2, "Output buffer too short");
+            Check.OutputLength(output, BlockSize * 2, "output buffer too short");
 
             if (totalLength == 0)
             {
@@ -1169,6 +1285,80 @@ namespace Org.BouncyCastle.Crypto.Modes
             totalLength += BlockSize * 2;
         }
 
+        private void EncryptBlocks4(ReadOnlySpan<byte> input, Span<byte> output) // Encrypts four consecutive blocks from input into output and folds the four output blocks into S.
+        {
+#if NETCOREAPP3_0_OR_GREATER
+            if (IsFourWaySupported) // vectorised path; otherwise execution falls through to the two EncryptBlocks2 calls at the end
+            {
+                Check.OutputLength(output, BlockSize * 4, "output buffer too short");
+
+                if (totalLength == 0) // totalLength is still zero (presumably nothing processed yet): call InitCipher() first
+                {
+                    InitCipher();
+                }
+
+                Span<byte> ctrBlocks = stackalloc byte[BlockSize * 4]; // stack scratch for four cipher-processed counter blocks
+                GetNextCtrBlocks4(ctrBlocks);
+
+                var p0 = MemoryMarshal.Read<Vector128<byte>>(input); // p0..p3: the four input (plaintext) blocks
+                var p1 = MemoryMarshal.Read<Vector128<byte>>(input[BlockSize..]);
+                var p2 = MemoryMarshal.Read<Vector128<byte>>(input[(BlockSize * 2)..]);
+                var p3 = MemoryMarshal.Read<Vector128<byte>>(input[(BlockSize * 3)..]);
+
+                var n0 = MemoryMarshal.Read<Vector128<byte>>(ctrBlocks); // n0..n3: the four processed counter blocks
+                var n1 = MemoryMarshal.Read<Vector128<byte>>(ctrBlocks[BlockSize..]);
+                var n2 = MemoryMarshal.Read<Vector128<byte>>(ctrBlocks[(BlockSize * 2)..]);
+                var n3 = MemoryMarshal.Read<Vector128<byte>>(ctrBlocks[(BlockSize * 3)..]);
+
+                var c0 = Sse2.Xor(p0, n0); // c0..c3: output blocks = input XOR processed counter
+                var c1 = Sse2.Xor(p1, n1);
+                var c2 = Sse2.Xor(p2, n2);
+                var c3 = Sse2.Xor(p3, n3);
+
+                MemoryMarshal.Write(output, ref c0);
+                MemoryMarshal.Write(output[BlockSize..], ref c1);
+                MemoryMarshal.Write(output[(BlockSize * 2)..], ref c2);
+                MemoryMarshal.Write(output[(BlockSize * 3)..], ref c3);
+
+                var t0 = MemoryMarshal.Read<Vector128<byte>>(S.AsSpan()); // current contents of S
+                c0 = Sse2.Xor(c0, t0); // S is XORed into the first output block only
+
+                c0 = Ssse3.Shuffle(c0, ReverseBytesMask); // reverse the byte order of each block before multiplying
+                c1 = Ssse3.Shuffle(c1, ReverseBytesMask);
+                c2 = Ssse3.Shuffle(c2, ReverseBytesMask);
+                c3 = Ssse3.Shuffle(c3, ReverseBytesMask);
+
+                GcmUtilities.MultiplyExt(c0.AsUInt64(), HPow[0], out var U0, out var U1, out var U2); // unreduced product with HPow[0] (H4)
+                GcmUtilities.MultiplyExt(c1.AsUInt64(), HPow[1], out var V0, out var V1, out var V2); // ... with HPow[1] (H3)
+                GcmUtilities.MultiplyExt(c2.AsUInt64(), HPow[2], out var W0, out var W1, out var W2); // ... with HPow[2] (H2)
+                GcmUtilities.MultiplyExt(c3.AsUInt64(), HPow[3], out var X0, out var X1, out var X2); // ... with HPow[3] (H1)
+
+                U0 = Sse2.Xor(U0, V0); // XOR the four unreduced products together, part by part
+                U1 = Sse2.Xor(U1, V1);
+                U2 = Sse2.Xor(U2, V2);
+
+                U0 = Sse2.Xor(U0, W0);
+                U1 = Sse2.Xor(U1, W1);
+                U2 = Sse2.Xor(U2, W2);
+
+                U0 = Sse2.Xor(U0, X0);
+                U1 = Sse2.Xor(U1, X1);
+                U2 = Sse2.Xor(U2, X2);
+
+                var t2 = GcmUtilities.Reduce3(U0, U1, U2).AsByte(); // a single reduction for all four blocks
+
+                t2 = Ssse3.Shuffle(t2, ReverseBytesMask); // undo the byte reversal before storing
+                MemoryMarshal.Write(S.AsSpan(), ref t2); // S now holds the updated value
+
+                totalLength += BlockSize * 4;
+                return;
+            }
+#endif
+
+            EncryptBlocks2(input, output); // fallback: two two-block calls covering the same four blocks
+            EncryptBlocks2(input[(BlockSize * 2)..], output[(BlockSize * 2)..]);
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private void GetNextCtrBlock(Span<byte> block)
         {
@@ -1197,6 +1387,51 @@ namespace Org.BouncyCastle.Crypto.Modes
             cipher.ProcessBlock(counter, blocks[BlockSize..]);
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void GetNextCtrBlocks4(Span<byte> blocks) // Advances the counter four times and writes the four cipher-processed counter blocks into blocks.
+        {
+            if (blocksRemaining < 4) // fewer than four blocks left in the budget: refuse before touching any state
+                throw new InvalidOperationException("Attempt to process too many blocks");
+
+            blocksRemaining -= 4;
+
+#if NETCOREAPP3_0_OR_GREATER
+            if (AesEngine_X86.IsSupported && cipher is AesEngine_X86 x86) // batch path, taken only when the cipher is an AesEngine_X86
+            {
+                uint counter0 = counter32;
+                uint counter1 = counter0 + 1U;
+                uint counter2 = counter0 + 2U;
+                uint counter3 = counter0 + 3U;
+                uint counter4 = counter0 + 4U;
+                counter32 = counter4; // the 32-bit counter advances by four in one step
+
+                counter.CopyTo(blocks); // blocks 0..2 start as copies of the current counter block
+                counter.CopyTo(blocks[BlockSize..]);
+                counter.CopyTo(blocks[(BlockSize * 2)..]);
+                Pack.UInt32_To_BE(counter4, counter, 12); // write counter4 into the stored counter block at offset 12
+                Pack.UInt32_To_BE(counter1, blocks[12..]); // patch the counter word of block 0
+                Pack.UInt32_To_BE(counter2, blocks[28..]); // ... of block 1 (offset 16 + 12)
+                Pack.UInt32_To_BE(counter3, blocks[44..]); // ... of block 2 (offset 32 + 12)
+                counter.CopyTo(blocks[(BlockSize * 3)..]); // block 3 is the already-updated counter block (counter4)
+
+                x86.ProcessFourBlocks(blocks, blocks); // same span passed as both input and output
+                return;
+            }
+#endif
+
+            Pack.UInt32_To_BE(++counter32, counter, 12); // generic path: increment, re-encode at offset 12, process one block at a time
+            cipher.ProcessBlock(counter, blocks);
+
+            Pack.UInt32_To_BE(++counter32, counter, 12);
+            cipher.ProcessBlock(counter, blocks[BlockSize..]);
+
+            Pack.UInt32_To_BE(++counter32, counter, 12);
+            cipher.ProcessBlock(counter, blocks[(BlockSize * 2)..]);
+
+            Pack.UInt32_To_BE(++counter32, counter, 12);
+            cipher.ProcessBlock(counter, blocks[(BlockSize * 3)..]);
+        }
+
         private void ProcessPartial(Span<byte> partialBlock, Span<byte> output)
         {
             Span<byte> ctrBlock = stackalloc byte[BlockSize];
@@ -1219,7 +1454,7 @@ namespace Org.BouncyCastle.Crypto.Modes
 #else
         private void DecryptBlock(byte[] inBuf, int inOff, byte[] outBuf, int outOff)
         {
-            Check.OutputLength(outBuf, outOff, BlockSize, "Output buffer too short");
+            Check.OutputLength(outBuf, outOff, BlockSize, "output buffer too short");
 
             if (totalLength == 0)
             {
@@ -1255,7 +1490,7 @@ namespace Org.BouncyCastle.Crypto.Modes
 
         private void DecryptBlocks2(byte[] inBuf, int inOff, byte[] outBuf, int outOff)
         {
-            Check.OutputLength(outBuf, outOff, BlockSize * 2, "Output buffer too short");
+            Check.OutputLength(outBuf, outOff, BlockSize * 2, "output buffer too short");
 
             if (totalLength == 0)
             {
@@ -1316,7 +1551,7 @@ namespace Org.BouncyCastle.Crypto.Modes
 
         private void EncryptBlock(byte[] inBuf, int inOff, byte[] outBuf, int outOff)
         {
-            Check.OutputLength(outBuf, outOff, BlockSize, "Output buffer too short");
+            Check.OutputLength(outBuf, outOff, BlockSize, "output buffer too short");
 
             if (totalLength == 0)
             {
@@ -1352,7 +1587,7 @@ namespace Org.BouncyCastle.Crypto.Modes
 
         private void EncryptBlocks2(byte[] inBuf, int inOff, byte[] outBuf, int outOff)
         {
-            Check.OutputLength(outBuf, outOff, BlockSize * 2, "Output buffer too short");
+            Check.OutputLength(outBuf, outOff, BlockSize * 2, "output buffer too short");
 
             if (totalLength == 0)
             {
diff --git a/crypto/src/crypto/modes/gcm/GcmUtilities.cs b/crypto/src/crypto/modes/gcm/GcmUtilities.cs
index a239e9ec0..b2c74d7d0 100644
--- a/crypto/src/crypto/modes/gcm/GcmUtilities.cs
+++ b/crypto/src/crypto/modes/gcm/GcmUtilities.cs
@@ -140,6 +140,108 @@ namespace Org.BouncyCastle.Crypto.Modes.Gcm
             x.n1 = z1;
         }
 
+#if NETCOREAPP3_0_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector128<ulong> Load(byte[] x) // Converts x to a FieldElement via AsFieldElement and packs its two words into a vector.
+        {
+            AsFieldElement(x, out FieldElement X);
+            return Vector128.Create(X.n1, X.n0); // element 0 = n1, element 1 = n0
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector128<ulong> Multiply(Vector128<ulong> X, Vector128<ulong> Y) // Returns the reduced product of X and Y: MultiplyExt followed by Reduce3.
+        {
+            MultiplyExt(X, Y, out var Z0, out var Z1, out var Z2); // throws PlatformNotSupportedException when Pclmulqdq is unavailable
+            return Reduce3(Z0, Z1, Z2);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static void MultiplyExt(Vector128<ulong> X, Vector128<ulong> Y, out Vector128<ulong> Z0, // Carry-less multiply of X by Y, returned unreduced as three partial products.
+            out Vector128<ulong> Z1, out Vector128<ulong> Z2) // callers in this file pass (Z0, Z1, Z2) on to Reduce3
+        {
+            if (!Pclmulqdq.IsSupported) // no software fallback here: callers must check support first
+                throw new PlatformNotSupportedException(nameof(GcmUtilities.MultiplyExt));
+
+            Z0 =          Pclmulqdq.CarrylessMultiply(X, Y, 0x00); // selector 0x00: lower 64-bit halves of X and Y
+            Z1 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X, Y, 0x01), // selectors 0x01 and 0x10: the two cross products (one lower half, one upper half)
+                          Pclmulqdq.CarrylessMultiply(X, Y, 0x10)); // XORed together into the middle term
+            Z2 =          Pclmulqdq.CarrylessMultiply(X, Y, 0x11); // selector 0x11: upper 64-bit halves of X and Y
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector128<ulong> Reduce2(Vector128<ulong> Z0, Vector128<ulong> Z2) // Reduces the two-part product from SquareExt (no middle term) to a single 128-bit vector.
+        {
+            ulong t3 = Z0.GetElement(0); // t0..t3: the four 64-bit words of the product; t0 is the most significant in the shift below
+            ulong t2 = Z0.GetElement(1);
+            ulong t1 = Z2.GetElement(0);
+            ulong t0 = Z2.GetElement(1);
+
+            ulong z0 = (t0 << 1) | (t1 >> 63); // shift the four-word value left by one bit, carrying between words
+            ulong z1 = (t1 << 1) | (t2 >> 63);
+            ulong z2 = (t2 << 1) | (t3 >> 63);
+            ulong z3 = (t3 << 1);
+
+            Debug.Assert(z3 << 63 == 0); // bit 0 of z3 is zero after the shift, so the (z3 << 63) term below can be omitted
+
+            z1 ^= z3 ^ (z3 >>  1) ^ (z3 >>  2) ^ (z3 >>  7); // fold z3 into z1 and z2 using shifts by 1, 2 and 7 (presumably the GCM reduction polynomial)
+//          z2 ^=      (z3 << 63) ^ (z3 << 62) ^ (z3 << 57);
+            z2 ^=                   (z3 << 62) ^ (z3 << 57);
+
+            z0 ^= z2 ^ (z2 >>  1) ^ (z2 >>  2) ^ (z2 >>  7); // fold the updated z2 into z0 and z1 the same way
+            z1 ^=      (z2 << 63) ^ (z2 << 62) ^ (z2 << 57);
+
+            return Vector128.Create(z1, z0); // element 0 = z1, element 1 = z0 (same layout as Load)
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector128<ulong> Reduce3(Vector128<ulong> Z0, Vector128<ulong> Z1, Vector128<ulong> Z2) // Reduces the three-part product from MultiplyExt to a single 128-bit vector.
+        {
+            ulong t3 = Z0.GetElement(0); // t0..t3: the four 64-bit words of the product; t0 is the most significant in the shift below
+            ulong t2 = Z0.GetElement(1) ^ Z1.GetElement(0); // middle term Z1 overlaps the two inner words
+            ulong t1 = Z2.GetElement(0) ^ Z1.GetElement(1);
+            ulong t0 = Z2.GetElement(1);
+
+            ulong z0 = (t0 << 1) | (t1 >> 63); // shift the four-word value left by one bit, carrying between words
+            ulong z1 = (t1 << 1) | (t2 >> 63);
+            ulong z2 = (t2 << 1) | (t3 >> 63);
+            ulong z3 = (t3 << 1);
+
+            Debug.Assert(z3 << 63 == 0); // bit 0 of z3 is zero after the shift, so the (z3 << 63) term below can be omitted
+
+            z1 ^= z3 ^ (z3 >>  1) ^ (z3 >>  2) ^ (z3 >>  7); // fold z3 into z1 and z2 using shifts by 1, 2 and 7 (presumably the GCM reduction polynomial)
+//          z2 ^=      (z3 << 63) ^ (z3 << 62) ^ (z3 << 57);
+            z2 ^=                   (z3 << 62) ^ (z3 << 57);
+
+            z0 ^= z2 ^ (z2 >>  1) ^ (z2 >>  2) ^ (z2 >>  7); // fold the updated z2 into z0 and z1 the same way
+            z1 ^=      (z2 << 63) ^ (z2 << 62) ^ (z2 << 57);
+
+            return Vector128.Create(z1, z0); // element 0 = z1, element 1 = z0 (same layout as Load)
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static void Store(Vector128<ulong> X, byte[] z) // Writes the two words of X into z via AsBytes.
+        {
+            AsBytes(X.GetElement(1), X.GetElement(0), z); // element 1 is passed first, mirroring Load, where n0 is placed in element 1
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector128<ulong> Square(Vector128<ulong> X) // Returns the reduced square of X: SquareExt followed by Reduce2.
+        {
+            SquareExt(X, out var Z0, out var Z2); // throws PlatformNotSupportedException when Pclmulqdq is unavailable
+            return Reduce2(Z0, Z2);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static void SquareExt(Vector128<ulong> X, out Vector128<ulong> Z0, out Vector128<ulong> Z2) // Carry-less square of X, returned unreduced as two partial products.
+        {
+            if (!Pclmulqdq.IsSupported) // no software fallback here: callers must check support first
+                throw new PlatformNotSupportedException(nameof(GcmUtilities.SquareExt));
+
+            Z0 = Pclmulqdq.CarrylessMultiply(X, X, 0x00); // lower half times itself
+            Z2 = Pclmulqdq.CarrylessMultiply(X, X, 0x11); // upper half times itself; the two cross products are equal and XOR to zero, so there is no middle term
+        }
+#endif
+
         internal static void MultiplyP7(ref FieldElement x)
         {
             ulong x0 = x.n0, x1 = x.n1;