summary refs log tree commit diff
path: root/crypto/src
diff options
context:
space:
mode:
author Peter Dettman <peter.dettman@bouncycastle.org> 2022-07-24 21:51:13 +0700
committer Peter Dettman <peter.dettman@bouncycastle.org> 2022-07-24 21:51:13 +0700
commit f802d60b4077d8c190995ea88885c46faa813d1a (patch)
tree fa2bef08a7c5f98cbfc1e753838a4b3c8a1753d2 /crypto/src
parent moved picnic GetParameters to Parameters (diff)
download BouncyCastle.NET-ed25519-f802d60b4077d8c190995ea88885c46faa813d1a.tar.xz
GCM perf. opts.
Diffstat (limited to 'crypto/src')
-rw-r--r-- crypto/src/crypto/modes/GCMBlockCipher.cs 432
-rw-r--r-- crypto/src/crypto/modes/gcm/GcmUtilities.cs 358
-rw-r--r-- crypto/src/crypto/modes/gcm/Tables4kGcmMultiplier.cs 3
-rw-r--r-- crypto/src/crypto/modes/gcm/Tables64kGcmMultiplier.cs 3
-rw-r--r-- crypto/src/crypto/modes/gcm/Tables8kGcmMultiplier.cs 3
5 files changed, 423 insertions, 376 deletions
diff --git a/crypto/src/crypto/modes/GCMBlockCipher.cs b/crypto/src/crypto/modes/GCMBlockCipher.cs
index 9250097cd..ac54e9762 100644
--- a/crypto/src/crypto/modes/GCMBlockCipher.cs
+++ b/crypto/src/crypto/modes/GCMBlockCipher.cs
@@ -1,4 +1,9 @@
 using System;
+#if NETCOREAPP3_0_OR_GREATER
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 using Org.BouncyCastle.Crypto.Modes.Gcm;
 using Org.BouncyCastle.Crypto.Parameters;
@@ -48,6 +53,7 @@ namespace Org.BouncyCastle.Crypto.Modes
         private byte[]		macBlock;
         private byte[]      S, S_at, S_atPre;
         private byte[]      counter;
+        private uint        counter32;
         private uint        blocksRemaining;
         private int         bufOff;
         private ulong		totalLength;
@@ -210,6 +216,7 @@ namespace Org.BouncyCastle.Crypto.Modes
             this.atLength = 0;
             this.atLengthPre = 0;
             this.counter = Arrays.Clone(J0);
+            this.counter32 = Pack.BE_To_UInt32(counter, 12);
             this.blocksRemaining = uint.MaxValue - 1; // page 8, len(P) <= 2^39 - 256, 1 block used by tag
             this.bufOff = 0;
             this.totalLength = 0;
@@ -273,17 +280,35 @@ namespace Org.BouncyCastle.Crypto.Modes
         {
             CheckStatus();
 
-            for (int i = 0; i < len; ++i)
+            if (atBlockPos > 0)
             {
-                atBlock[atBlockPos] = inBytes[inOff + i];
-                if (++atBlockPos == BlockSize)
+                int available = BlockSize - atBlockPos;
+                if (len < available)
                 {
-                    // Hash each block as it fills
-                    gHASHBlock(S_at, atBlock);
-                    atBlockPos = 0;
-                    atLength += BlockSize;
+                    Array.Copy(inBytes, inOff, atBlock, atBlockPos, len);
+                    atBlockPos += len;
+                    return;
                 }
+
+                Array.Copy(inBytes, inOff, atBlock, atBlockPos, available);
+                gHASHBlock(S_at, atBlock);
+                atLength += BlockSize;
+                inOff += available;
+                len -= available;
+                //atBlockPos = 0;
+            }
+
+            int inLimit = inOff + len - BlockSize;
+
+            while (inOff <= inLimit)
+            {
+                gHASHBlock(S_at, inBytes, inOff);
+                atLength += BlockSize;
+                inOff += BlockSize;
             }
+
+            atBlockPos = BlockSize + inLimit - inOff;
+            Array.Copy(inBytes, inOff, atBlock, 0, atBlockPos);
         }
 
         private void InitCipher()
@@ -307,23 +332,21 @@ namespace Org.BouncyCastle.Crypto.Modes
             }
         }
 
-        public virtual int ProcessByte(
-            byte	input,
-            byte[]	output,
-            int		outOff)
+        public virtual int ProcessByte(byte	input, byte[] output, int outOff)
         {
             CheckStatus();
 
             bufBlock[bufOff] = input;
             if (++bufOff == bufBlock.Length)
             {
-                ProcessBlock(bufBlock, 0, output, outOff);
                 if (forEncryption)
                 {
+                    EncryptBlock(bufBlock, 0, output, outOff);
                     bufOff = 0;
                 }
                 else
                 {
+                    DecryptBlock(bufBlock, 0, output, outOff);
                     Array.Copy(bufBlock, BlockSize, bufBlock, 0, macSize);
                     bufOff = macSize;
                 }
@@ -332,12 +355,7 @@ namespace Org.BouncyCastle.Crypto.Modes
             return 0;
         }
 
-        public virtual int ProcessBytes(
-            byte[]	input,
-            int		inOff,
-            int		len,
-            byte[]	output,
-            int		outOff)
+        public virtual int ProcessBytes(byte[] input, int inOff, int len, byte[] output, int outOff)
         {
             CheckStatus();
 
@@ -347,49 +365,95 @@ namespace Org.BouncyCastle.Crypto.Modes
 
             if (forEncryption)
             {
-                if (bufOff != 0)
+                if (bufOff > 0)
                 {
-                    while (len > 0)
+                    int available = BlockSize - bufOff;
+                    if (len < available)
                     {
-                        --len;
-                        bufBlock[bufOff] = input[inOff++];
-                        if (++bufOff == BlockSize)
-                        {
-                            ProcessBlock(bufBlock, 0, output, outOff);
-                            bufOff = 0;
-                            resultLen += BlockSize;
-                            break;
-                        }
+                        Array.Copy(input, inOff, bufBlock, bufOff, len);
+                        bufOff += len;
+                        return 0;
                     }
+
+                    Array.Copy(input, inOff, bufBlock, bufOff, available);
+                    EncryptBlock(bufBlock, 0, output, outOff);
+                    inOff += available;
+                    len -= available;
+                    resultLen = BlockSize;
+                    //bufOff = 0;
                 }
 
-                while (len >= BlockSize)
+                int inLimit1 = inOff + len - BlockSize;
+                int inLimit2 = inLimit1 - BlockSize;
+
+                while (inOff <= inLimit2)
                 {
-                    ProcessBlock(input, inOff, output, outOff + resultLen);
-                    inOff += BlockSize;
-                    len -= BlockSize;
-                    resultLen += BlockSize;
+                    EncryptBlocks2(input, inOff, output, outOff + resultLen);
+                    inOff += BlockSize * 2;
+                    resultLen += BlockSize * 2;
                 }
 
-                if (len > 0)
+                if (inOff <= inLimit1)
                 {
-                    Array.Copy(input, inOff, bufBlock, 0, len);
-                    bufOff = len;
+                    EncryptBlock(input, inOff, output, outOff + resultLen);
+                    inOff += BlockSize;
+                    resultLen += BlockSize;
                 }
+
+                bufOff = BlockSize + inLimit1 - inOff;
+                Array.Copy(input, inOff, bufBlock, 0, bufOff);
             }
             else
             {
-                for (int i = 0; i < len; ++i)
+                int available = bufBlock.Length - bufOff;
+                if (len < available)
+                {
+                    Array.Copy(input, inOff, bufBlock, bufOff, len);
+                    bufOff += len;
+                    return 0;
+                }
+
+                if (bufOff >= BlockSize)
                 {
-                    bufBlock[bufOff] = input[inOff + i];
-                    if (++bufOff == bufBlock.Length)
+                    DecryptBlock(bufBlock, 0, output, outOff);
+                    Array.Copy(bufBlock, BlockSize, bufBlock, 0, bufOff -= BlockSize);
+                    resultLen = BlockSize;
+
+                    available += BlockSize;
+                    if (len < available)
                     {
-                        ProcessBlock(bufBlock, 0, output, outOff + resultLen);
-                        Array.Copy(bufBlock, BlockSize, bufBlock, 0, macSize);
-                        bufOff = macSize;
-                        resultLen += BlockSize;
+                        Array.Copy(input, inOff, bufBlock, bufOff, len);
+                        bufOff += len;
+                        return resultLen;
                     }
                 }
+
+                int inLimit1 = inOff + len - bufBlock.Length;
+                int inLimit2 = inLimit1 - BlockSize;
+
+                available = BlockSize - bufOff;
+                Array.Copy(input, inOff, bufBlock, bufOff, available);
+                DecryptBlock(bufBlock, 0, output, outOff + resultLen);
+                inOff += available;
+                resultLen += BlockSize;
+                //bufOff = 0;
+
+                while (inOff <= inLimit2)
+                {
+                    DecryptBlocks2(input, inOff, output, outOff + resultLen);
+                    inOff += BlockSize * 2;
+                    resultLen += BlockSize * 2;
+                }
+
+                if (inOff <= inLimit1)
+                {
+                    DecryptBlock(input, inOff, output, outOff + resultLen);
+                    inOff += BlockSize;
+                    resultLen += BlockSize;
+                }
+
+                bufOff = bufBlock.Length + inLimit1 - inOff;
+                Array.Copy(input, inOff, bufBlock, 0, bufOff);
             }
 
             return resultLen;
@@ -525,6 +589,7 @@ namespace Org.BouncyCastle.Crypto.Modes
             atLength = 0;
             atLengthPre = 0;
             counter = Arrays.Clone(J0);
+            counter32 = Pack.BE_To_UInt32(counter, 12);
             blocksRemaining = uint.MaxValue - 1;
             bufOff = 0;
             totalLength = 0;
@@ -552,9 +617,9 @@ namespace Org.BouncyCastle.Crypto.Modes
             }
         }
 
-        private void ProcessBlock(byte[] buf, int bufOff, byte[] output, int outOff)
+        private void DecryptBlock(byte[] inBuf, int inOff, byte[] outBuf, int outOff)
         {
-            Check.OutputLength(output, outOff, BlockSize, "Output buffer too short");
+            Check.OutputLength(outBuf, outOff, BlockSize, "Output buffer too short");
 
             if (totalLength == 0)
             {
@@ -562,23 +627,280 @@ namespace Org.BouncyCastle.Crypto.Modes
             }
 
             byte[] ctrBlock = new byte[BlockSize];
+
             GetNextCtrBlock(ctrBlock);
+#if NETCOREAPP3_0_OR_GREATER
+            if (Sse2.IsSupported && Unsafe.SizeOf<Vector128<byte>>() == BlockSize)
+            {
+                var t0 = Unsafe.ReadUnaligned<Vector128<byte>>(ref inBuf[inOff]);
+                var t1 = Unsafe.ReadUnaligned<Vector128<byte>>(ref ctrBlock[0]);
+                var t2 = Unsafe.ReadUnaligned<Vector128<byte>>(ref S[0]);
 
-            if (forEncryption)
+                t1 = Sse2.Xor(t1, t0);
+                t2 = Sse2.Xor(t2, t0);
+
+                Unsafe.WriteUnaligned(ref outBuf[outOff], t1);
+                Unsafe.WriteUnaligned(ref S[0], t2);
+            }
+            else
+#endif
+            {
+                for (int i = 0; i < BlockSize; i += 4)
+                {
+                    byte c0 = inBuf[inOff + i + 0];
+                    byte c1 = inBuf[inOff + i + 1];
+                    byte c2 = inBuf[inOff + i + 2];
+                    byte c3 = inBuf[inOff + i + 3];
+
+                    S[i + 0] ^= c0;
+                    S[i + 1] ^= c1;
+                    S[i + 2] ^= c2;
+                    S[i + 3] ^= c3;
+
+                    outBuf[outOff + i + 0] = (byte)(c0 ^ ctrBlock[i + 0]);
+                    outBuf[outOff + i + 1] = (byte)(c1 ^ ctrBlock[i + 1]);
+                    outBuf[outOff + i + 2] = (byte)(c2 ^ ctrBlock[i + 2]);
+                    outBuf[outOff + i + 3] = (byte)(c3 ^ ctrBlock[i + 3]);
+                }
+            }
+            multiplier.MultiplyH(S);
+
+            totalLength += BlockSize;
+        }
+
+        private void DecryptBlocks2(byte[] inBuf, int inOff, byte[] outBuf, int outOff)
+        {
+            Check.OutputLength(outBuf, outOff, BlockSize * 2, "Output buffer too short");
+
+            if (totalLength == 0)
             {
-                GcmUtilities.Xor(ctrBlock, buf, bufOff);
-                gHASHBlock(S, ctrBlock);
-                Array.Copy(ctrBlock, 0, output, outOff, BlockSize);
+                InitCipher();
+            }
+
+            byte[] ctrBlock = new byte[BlockSize];
+
+            GetNextCtrBlock(ctrBlock);
+#if NETCOREAPP3_0_OR_GREATER
+            if (Sse2.IsSupported && Unsafe.SizeOf<Vector128<byte>>() == BlockSize)
+            {
+                var t0 = Unsafe.ReadUnaligned<Vector128<byte>>(ref inBuf[inOff]);
+                var t1 = Unsafe.ReadUnaligned<Vector128<byte>>(ref ctrBlock[0]);
+                var t2 = Unsafe.ReadUnaligned<Vector128<byte>>(ref S[0]);
+
+                t1 = Sse2.Xor(t1, t0);
+                t2 = Sse2.Xor(t2, t0);
+
+                Unsafe.WriteUnaligned(ref outBuf[outOff], t1);
+                Unsafe.WriteUnaligned(ref S[0], t2);
             }
             else
+#endif
             {
-                gHASHBlock(S, buf, bufOff);
-                GcmUtilities.Xor(ctrBlock, 0, buf, bufOff, output, outOff);
+                for (int i = 0; i < BlockSize; i += 4)
+                {
+                    byte c0 = inBuf[inOff + i + 0];
+                    byte c1 = inBuf[inOff + i + 1];
+                    byte c2 = inBuf[inOff + i + 2];
+                    byte c3 = inBuf[inOff + i + 3];
+
+                    S[i + 0] ^= c0;
+                    S[i + 1] ^= c1;
+                    S[i + 2] ^= c2;
+                    S[i + 3] ^= c3;
+
+                    outBuf[outOff + i + 0] = (byte)(c0 ^ ctrBlock[i + 0]);
+                    outBuf[outOff + i + 1] = (byte)(c1 ^ ctrBlock[i + 1]);
+                    outBuf[outOff + i + 2] = (byte)(c2 ^ ctrBlock[i + 2]);
+                    outBuf[outOff + i + 3] = (byte)(c3 ^ ctrBlock[i + 3]);
+                }
             }
+            multiplier.MultiplyH(S);
+
+            inOff += BlockSize;
+            outOff += BlockSize;
+
+            GetNextCtrBlock(ctrBlock);
+#if NETCOREAPP3_0_OR_GREATER
+            if (Sse2.IsSupported && Unsafe.SizeOf<Vector128<byte>>() == BlockSize)
+            {
+                var t0 = Unsafe.ReadUnaligned<Vector128<byte>>(ref inBuf[inOff]);
+                var t1 = Unsafe.ReadUnaligned<Vector128<byte>>(ref ctrBlock[0]);
+                var t2 = Unsafe.ReadUnaligned<Vector128<byte>>(ref S[0]);
+
+                t1 = Sse2.Xor(t1, t0);
+                t2 = Sse2.Xor(t2, t0);
+
+                Unsafe.WriteUnaligned(ref outBuf[outOff], t1);
+                Unsafe.WriteUnaligned(ref S[0], t2);
+            }
+            else
+#endif
+            {
+                for (int i = 0; i < BlockSize; i += 4)
+                {
+                    byte c0 = inBuf[inOff + i + 0];
+                    byte c1 = inBuf[inOff + i + 1];
+                    byte c2 = inBuf[inOff + i + 2];
+                    byte c3 = inBuf[inOff + i + 3];
+
+                    S[i + 0] ^= c0;
+                    S[i + 1] ^= c1;
+                    S[i + 2] ^= c2;
+                    S[i + 3] ^= c3;
+
+                    outBuf[outOff + i + 0] = (byte)(c0 ^ ctrBlock[i + 0]);
+                    outBuf[outOff + i + 1] = (byte)(c1 ^ ctrBlock[i + 1]);
+                    outBuf[outOff + i + 2] = (byte)(c2 ^ ctrBlock[i + 2]);
+                    outBuf[outOff + i + 3] = (byte)(c3 ^ ctrBlock[i + 3]);
+                }
+            }
+            multiplier.MultiplyH(S);
+
+            totalLength += BlockSize * 2;
+        }
+
+        private void EncryptBlock(byte[] inBuf, int inOff, byte[] outBuf, int outOff)
+        {   // Encrypts one BlockSize-byte block of inBuf at inOff into outBuf at outOff and folds the produced output block into S.
+            Check.OutputLength(outBuf, outOff, BlockSize, "Output buffer too short");
+            // Only the output span is validated here; no input-length check is visible in this method.
+            if (totalLength == 0)
+            {
+                InitCipher();   // runs before the first block is processed (totalLength is still 0)
+            }
+
+            byte[] ctrBlock = new byte[BlockSize];
+            // GetNextCtrBlock fills ctrBlock; presumably with the next keystream block (encrypted counter) - confirm against its body.
+            GetNextCtrBlock(ctrBlock);
+#if NETCOREAPP3_0_OR_GREATER
+            if (Sse2.IsSupported && Unsafe.SizeOf<Vector128<byte>>() == BlockSize)
+            {
+                var t0 = Unsafe.ReadUnaligned<Vector128<byte>>(ref inBuf[inOff]);   // input block; NOTE(review): only index inOff is bounds-checked, the read spans a whole vector - confirm callers always pass BlockSize bytes
+                var t1 = Unsafe.ReadUnaligned<Vector128<byte>>(ref ctrBlock[0]);    // ctrBlock contents
+                var t2 = Unsafe.ReadUnaligned<Vector128<byte>>(ref S[0]);           // current value of S
+                // t1 becomes the output block (ctrBlock ^ input); S is XORed with that output, not with the input.
+                t1 = Sse2.Xor(t1, t0);
+                t2 = Sse2.Xor(t2, t1);
+
+                Unsafe.WriteUnaligned(ref outBuf[outOff], t1);
+                Unsafe.WriteUnaligned(ref S[0], t2);
+            }
+            else
+#endif
+            {
+                for (int i = 0; i < BlockSize; i += 4)   // portable path: the same XORs, four bytes per iteration
+                {
+                    byte c0 = (byte)(ctrBlock[i + 0] ^ inBuf[inOff + i + 0]);
+                    byte c1 = (byte)(ctrBlock[i + 1] ^ inBuf[inOff + i + 1]);
+                    byte c2 = (byte)(ctrBlock[i + 2] ^ inBuf[inOff + i + 2]);
+                    byte c3 = (byte)(ctrBlock[i + 3] ^ inBuf[inOff + i + 3]);
+                    // c0..c3 are the output bytes; they are XORed into S and then stored to outBuf.
+                    S[i + 0] ^= c0;
+                    S[i + 1] ^= c1;
+                    S[i + 2] ^= c2;
+                    S[i + 3] ^= c3;
+
+                    outBuf[outOff + i + 0] = c0;
+                    outBuf[outOff + i + 1] = c1;
+                    outBuf[outOff + i + 2] = c2;
+                    outBuf[outOff + i + 3] = c3;
+                }
+            }
+            multiplier.MultiplyH(S);   // presumably multiplies S by the hash key H in place (GHASH step) - confirm against the multiplier
 
             totalLength += BlockSize;
         }
 
+        private void EncryptBlocks2(byte[] inBuf, int inOff, byte[] outBuf, int outOff)
+        {   // Encrypts two consecutive BlockSize-byte blocks of inBuf at inOff into outBuf at outOff, updating S after each block.
+            Check.OutputLength(outBuf, outOff, BlockSize * 2, "Output buffer too short");
+            // Output space for both blocks is validated up front; no input-length check is visible in this method.
+            if (totalLength == 0)
+            {
+                InitCipher();   // runs before the first block is processed (totalLength is still 0)
+            }
+
+            byte[] ctrBlock = new byte[BlockSize];   // single scratch buffer, refilled for the second block below
+            // First block. GetNextCtrBlock fills ctrBlock; presumably with the next keystream block - confirm against its body.
+            GetNextCtrBlock(ctrBlock);
+#if NETCOREAPP3_0_OR_GREATER
+            if (Sse2.IsSupported && Unsafe.SizeOf<Vector128<byte>>() == BlockSize)
+            {
+                var t0 = Unsafe.ReadUnaligned<Vector128<byte>>(ref inBuf[inOff]);   // input block; NOTE(review): only index inOff is bounds-checked, the read spans a whole vector - confirm callers always pass two full blocks
+                var t1 = Unsafe.ReadUnaligned<Vector128<byte>>(ref ctrBlock[0]);    // ctrBlock contents
+                var t2 = Unsafe.ReadUnaligned<Vector128<byte>>(ref S[0]);           // current value of S
+                // t1 becomes the output block (ctrBlock ^ input); S is XORed with that output, not with the input.
+                t1 = Sse2.Xor(t1, t0);
+                t2 = Sse2.Xor(t2, t1);
+
+                Unsafe.WriteUnaligned(ref outBuf[outOff], t1);
+                Unsafe.WriteUnaligned(ref S[0], t2);
+            }
+            else
+#endif
+            {
+                for (int i = 0; i < BlockSize; i += 4)   // portable path: the same XORs, four bytes per iteration
+                {
+                    byte c0 = (byte)(ctrBlock[i + 0] ^ inBuf[inOff + i + 0]);
+                    byte c1 = (byte)(ctrBlock[i + 1] ^ inBuf[inOff + i + 1]);
+                    byte c2 = (byte)(ctrBlock[i + 2] ^ inBuf[inOff + i + 2]);
+                    byte c3 = (byte)(ctrBlock[i + 3] ^ inBuf[inOff + i + 3]);
+                    // c0..c3 are the output bytes; they are XORed into S and then stored to outBuf.
+                    S[i + 0] ^= c0;
+                    S[i + 1] ^= c1;
+                    S[i + 2] ^= c2;
+                    S[i + 3] ^= c3;
+
+                    outBuf[outOff + i + 0] = c0;
+                    outBuf[outOff + i + 1] = c1;
+                    outBuf[outOff + i + 2] = c2;
+                    outBuf[outOff + i + 3] = c3;
+                }
+            }
+            multiplier.MultiplyH(S);   // S is updated once per block, so this call must stay between the two blocks
+            // Second block: advance both offsets by one block and repeat the same steps.
+            inOff += BlockSize;
+            outOff += BlockSize;
+
+            GetNextCtrBlock(ctrBlock);
+#if NETCOREAPP3_0_OR_GREATER
+            if (Sse2.IsSupported && Unsafe.SizeOf<Vector128<byte>>() == BlockSize)
+            {
+                var t0 = Unsafe.ReadUnaligned<Vector128<byte>>(ref inBuf[inOff]);   // second input block (same bounds caveat as above)
+                var t1 = Unsafe.ReadUnaligned<Vector128<byte>>(ref ctrBlock[0]);
+                var t2 = Unsafe.ReadUnaligned<Vector128<byte>>(ref S[0]);
+
+                t1 = Sse2.Xor(t1, t0);
+                t2 = Sse2.Xor(t2, t1);
+
+                Unsafe.WriteUnaligned(ref outBuf[outOff], t1);
+                Unsafe.WriteUnaligned(ref S[0], t2);
+            }
+            else
+#endif
+            {
+                for (int i = 0; i < BlockSize; i += 4)
+                {
+                    byte c0 = (byte)(ctrBlock[i + 0] ^ inBuf[inOff + i + 0]);
+                    byte c1 = (byte)(ctrBlock[i + 1] ^ inBuf[inOff + i + 1]);
+                    byte c2 = (byte)(ctrBlock[i + 2] ^ inBuf[inOff + i + 2]);
+                    byte c3 = (byte)(ctrBlock[i + 3] ^ inBuf[inOff + i + 3]);
+
+                    S[i + 0] ^= c0;
+                    S[i + 1] ^= c1;
+                    S[i + 2] ^= c2;
+                    S[i + 3] ^= c3;
+
+                    outBuf[outOff + i + 0] = c0;
+                    outBuf[outOff + i + 1] = c1;
+                    outBuf[outOff + i + 2] = c2;
+                    outBuf[outOff + i + 3] = c3;
+                }
+            }
+            multiplier.MultiplyH(S);
+            // Both blocks are accounted for in a single update.
+            totalLength += BlockSize * 2;
+        }
+
         private void ProcessPartial(byte[] buf, int off, int len, byte[] output, int outOff)
         {
             byte[] ctrBlock = new byte[BlockSize];
@@ -633,11 +955,7 @@ namespace Org.BouncyCastle.Crypto.Modes
 
             blocksRemaining--;
 
-            uint c = 1;
-            c += counter[15]; counter[15] = (byte)c; c >>= 8;
-            c += counter[14]; counter[14] = (byte)c; c >>= 8;
-            c += counter[13]; counter[13] = (byte)c; c >>= 8;
-            c += counter[12]; counter[12] = (byte)c;
+            Pack.UInt32_To_BE(++counter32, counter, 12);
 
             cipher.ProcessBlock(counter, 0, block, 0);
         }
diff --git a/crypto/src/crypto/modes/gcm/GcmUtilities.cs b/crypto/src/crypto/modes/gcm/GcmUtilities.cs
index cf21ace23..4528e172a 100644
--- a/crypto/src/crypto/modes/gcm/GcmUtilities.cs
+++ b/crypto/src/crypto/modes/gcm/GcmUtilities.cs
@@ -1,5 +1,8 @@
 using System;
 using System.Diagnostics;
+#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER
+using System.Runtime.CompilerServices;
+#endif
 #if NETCOREAPP3_0_OR_GREATER
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
@@ -13,6 +16,12 @@ namespace Org.BouncyCastle.Crypto.Modes.Gcm
 {
     internal abstract class GcmUtilities
     {
+#if NETCOREAPP3_0_OR_GREATER
+        private static readonly Vector128<byte> EndianMask = Vector128.Create(   // byte-index table passed to Ssse3.Shuffle in this class
+            (byte)0x07, (byte)0x06, (byte)0x05, (byte)0x04, (byte)0x03, (byte)0x02, (byte)0x01, (byte)0x00,   // result bytes 0-7 select source bytes 7..0
+            (byte)0x0F, (byte)0x0E, (byte)0x0D, (byte)0x0C, (byte)0x0B, (byte)0x0A, (byte)0x09, (byte)0x08);  // result bytes 8-15 select source bytes 15..8
+#endif
+
         internal struct FieldElement
         {
             internal ulong n0, n1;
@@ -27,128 +36,53 @@ namespace Org.BouncyCastle.Crypto.Modes.Gcm
             x.n1 = 0UL;
         }
 
-        internal static byte[] OneAsBytes()
-        {
-            byte[] tmp = new byte[16];
-            tmp[0] = 0x80;
-            return tmp;
-        }
-
-        internal static uint[] OneAsUints()
-        {
-            uint[] tmp = new uint[4];
-            tmp[0] = 0x80000000;
-            return tmp;
-        }
-
-        internal static ulong[] OneAsUlongs()
-        {
-            ulong[] tmp = new ulong[2];
-            tmp[0] = 1UL << 63;
-            return tmp;
-        }
-
-        internal static byte[] AsBytes(uint[] x)
-        {
-            return Pack.UInt32_To_BE(x);
-        }
-
-        internal static void AsBytes(uint[] x, byte[] z)
-        {
-            Pack.UInt32_To_BE(x, z, 0);
-        }
-
-        internal static byte[] AsBytes(ulong[] x)
+#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+#endif
+        internal static void AsBytes(ulong x0, ulong x1, byte[] z)
         {
-            byte[] z = new byte[16];
-            Pack.UInt64_To_BE(x, z, 0);
-            return z;
-        }
+#if NETCOREAPP3_0_OR_GREATER
+            if (Ssse3.IsSupported && BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<byte>>() == 16)
+            {
+                var X = Vector128.Create(x0, x1).AsByte();
+                var Z = Ssse3.Shuffle(X, EndianMask);
+                Unsafe.WriteUnaligned(ref z[0], Z);
+                return;
+            }
+#endif
 
-        internal static void AsBytes(ulong[] x, byte[] z)
-        {
-            Pack.UInt64_To_BE(x, z, 0);
+            Pack.UInt64_To_BE(x0, z, 0);
+            Pack.UInt64_To_BE(x1, z, 8);
         }
 
+#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+#endif
         internal static void AsBytes(ref FieldElement x, byte[] z)
         {
-            Pack.UInt64_To_BE(x.n0, z, 0);
-            Pack.UInt64_To_BE(x.n1, z, 8);
+            AsBytes(x.n0, x.n1, z);
         }
 
+#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+#endif
         internal static void AsFieldElement(byte[] x, out FieldElement z)
         {   // Loads 16 bytes of x into z: n0 from the bytes at offset 0, n1 from the bytes at offset 8 (big-endian, going by the Pack.BE_To_UInt64 name).
+#if NETCOREAPP3_0_OR_GREATER
+            if (Ssse3.IsSupported && BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<byte>>() == 16)
+            {
+                var X = Unsafe.ReadUnaligned<Vector128<byte>>(ref x[0]);   // NOTE(review): only x[0] is bounds-checked; the read spans 16 bytes - confirm callers pass at least 16
+                var Z = Ssse3.Shuffle(X, EndianMask).AsUInt64();           // EndianMask reverses the bytes inside each 8-byte half
+                z.n0 = Z.GetElement(0);
+                z.n1 = Z.GetElement(1);
+                return;
+            }
+#endif
+            // Portable fallback, also used when SSSE3 is unavailable or the platform is not little-endian.
             z.n0 = Pack.BE_To_UInt64(x, 0);
             z.n1 = Pack.BE_To_UInt64(x, 8);
         }
 
-        internal static uint[] AsUints(byte[] bs)
-        {
-            uint[] output = new uint[4];
-            Pack.BE_To_UInt32(bs, 0, output);
-            return output;
-        }
-
-        internal static void AsUints(byte[] bs, uint[] output)
-        {
-            Pack.BE_To_UInt32(bs, 0, output);
-        }
-
-        internal static ulong[] AsUlongs(byte[] x)
-        {
-            ulong[] z = new ulong[2];
-            Pack.BE_To_UInt64(x, 0, z);
-            return z;
-        }
-
-        internal static void AsUlongs(byte[] x, ulong[] z)
-        {
-            Pack.BE_To_UInt64(x, 0, z);
-        }
-
-        internal static void AsUlongs(byte[] x, ulong[] z, int zOff)
-        {
-            Pack.BE_To_UInt64(x, 0, z, zOff, 2);
-        }
-
-        internal static void Copy(uint[] x, uint[] z)
-        {
-            z[0] = x[0];
-            z[1] = x[1];
-            z[2] = x[2];
-            z[3] = x[3];
-        }
-
-        internal static void Copy(ulong[] x, ulong[] z)
-        {
-            z[0] = x[0];
-            z[1] = x[1];
-        }
-
-        internal static void Copy(ulong[] x, int xOff, ulong[] z, int zOff)
-        {
-            z[zOff + 0] = x[xOff + 0];
-            z[zOff + 1] = x[xOff + 1];
-        }
-
-        internal static void DivideP(ulong[] x, ulong[] z)
-        {
-            ulong x0 = x[0], x1 = x[1];
-            ulong m = (ulong)((long)x0 >> 63);
-            x0 ^= (m & E1UL);
-            z[0] = (x0 << 1) | (x1 >> 63);
-            z[1] = (x1 << 1) | (ulong)(-(long)m);
-        }
-
-        internal static void DivideP(ulong[] x, int xOff, ulong[] z, int zOff)
-        {
-            ulong x0 = x[xOff + 0], x1 = x[xOff + 1];
-            ulong m = (ulong)((long)x0 >> 63);
-            x0 ^= (m & E1UL);
-            z[zOff + 0] = (x0 << 1) | (x1 >> 63);
-            z[zOff + 1] = (x1 << 1) | (ulong)(-(long)m);
-        }
-
         internal static void DivideP(ref FieldElement x, out FieldElement z)
         {
             ulong x0 = x.n0, x1 = x.n1;
@@ -233,90 +167,6 @@ namespace Org.BouncyCastle.Crypto.Modes.Gcm
             x.n1 = z1;
         }
 
-        internal static void MultiplyP(uint[] x)
-        {
-            uint x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3];
-            uint m = (uint)((int)(x3 << 31) >> 31);
-            x[0] = (x0 >> 1) ^ (m & E1);
-            x[1] = (x1 >> 1) | (x0 << 31);
-            x[2] = (x2 >> 1) | (x1 << 31);
-            x[3] = (x3 >> 1) | (x2 << 31);
-        }
-
-        internal static void MultiplyP(uint[] x, uint[] z)
-        {
-            uint x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3];
-            uint m = (uint)((int)(x3 << 31) >> 31);
-            z[0] = (x0 >> 1) ^ (m & E1);
-            z[1] = (x1 >> 1) | (x0 << 31);
-            z[2] = (x2 >> 1) | (x1 << 31);
-            z[3] = (x3 >> 1) | (x2 << 31);
-        }
-
-        internal static void MultiplyP(ulong[] x)
-        {
-            ulong x0 = x[0], x1 = x[1];
-            ulong m = (ulong)((long)(x1 << 63) >> 63);
-            x[0] = (x0 >> 1) ^ (m & E1UL);
-            x[1] = (x1 >> 1) | (x0 << 63);
-        }
-
-        internal static void MultiplyP(ulong[] x, ulong[] z)
-        {
-            ulong x0 = x[0], x1 = x[1];
-            ulong m = (ulong)((long)(x1 << 63) >> 63);
-            z[0] = (x0 >> 1) ^ (m & E1UL);
-            z[1] = (x1 >> 1) | (x0 << 63);
-        }
-
-        internal static void MultiplyP3(ulong[] x, ulong[] z)
-        {
-            ulong x0 = x[0], x1 = x[1];
-            ulong c = x1 << 61;
-            z[0] = (x0 >> 3) ^ c ^ (c >> 1) ^ (c >> 2) ^ (c >> 7);
-            z[1] = (x1 >> 3) | (x0 << 61);
-        }
-
-        internal static void MultiplyP3(ulong[] x, int xOff, ulong[] z, int zOff)
-        {
-            ulong x0 = x[xOff + 0], x1 = x[xOff + 1];
-            ulong c = x1 << 61;
-            z[zOff + 0] = (x0 >> 3) ^ c ^ (c >> 1) ^ (c >> 2) ^ (c >> 7);
-            z[zOff + 1] = (x1 >> 3) | (x0 << 61);
-        }
-
-        internal static void MultiplyP4(ulong[] x, ulong[] z)
-        {
-            ulong x0 = x[0], x1 = x[1];
-            ulong c = x1 << 60;
-            z[0] = (x0 >> 4) ^ c ^ (c >> 1) ^ (c >> 2) ^ (c >> 7);
-            z[1] = (x1 >> 4) | (x0 << 60);
-        }
-
-        internal static void MultiplyP4(ulong[] x, int xOff, ulong[] z, int zOff)
-        {
-            ulong x0 = x[xOff + 0], x1 = x[xOff + 1];
-            ulong c = x1 << 60;
-            z[zOff + 0] = (x0 >> 4) ^ c ^ (c >> 1) ^ (c >> 2) ^ (c >> 7);
-            z[zOff + 1] = (x1 >> 4) | (x0 << 60);
-        }
-
-        internal static void MultiplyP7(ulong[] x, ulong[] z)
-        {
-            ulong x0 = x[0], x1 = x[1];
-            ulong c = x1 << 57;
-            z[0] = (x0 >> 7) ^ c ^ (c >> 1) ^ (c >> 2) ^ (c >> 7);
-            z[1] = (x1 >> 7) | (x0 << 57);
-        }
-
-        internal static void MultiplyP7(ulong[] x, int xOff, ulong[] z, int zOff)
-        {
-            ulong x0 = x[xOff + 0], x1 = x[xOff + 1];
-            ulong c = x1 << 57;
-            z[zOff + 0] = (x0 >> 7) ^ c ^ (c >> 1) ^ (c >> 2) ^ (c >> 7);
-            z[zOff + 1] = (x1 >> 7) | (x0 << 57);
-        }
-
         internal static void MultiplyP7(ref FieldElement x)
         {
             ulong x0 = x.n0, x1 = x.n1;
@@ -325,50 +175,6 @@ namespace Org.BouncyCastle.Crypto.Modes.Gcm
             x.n1 = (x1 >> 7) | (x0 << 57);
         }
 
-        internal static void MultiplyP8(uint[] x)
-        {
-            uint x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3];
-            uint c = x3 << 24;
-            x[0] = (x0 >> 8) ^ c ^ (c >> 1) ^ (c >> 2) ^ (c >> 7);
-            x[1] = (x1 >> 8) | (x0 << 24);
-            x[2] = (x2 >> 8) | (x1 << 24);
-            x[3] = (x3 >> 8) | (x2 << 24);
-        }
-
-        internal static void MultiplyP8(uint[] x, uint[] y)
-        {
-            uint x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3];
-            uint c = x3 << 24;
-            y[0] = (x0 >> 8) ^ c ^ (c >> 1) ^ (c >> 2) ^ (c >> 7);
-            y[1] = (x1 >> 8) | (x0 << 24);
-            y[2] = (x2 >> 8) | (x1 << 24);
-            y[3] = (x3 >> 8) | (x2 << 24);
-        }
-
-        internal static void MultiplyP8(ulong[] x)
-        {
-            ulong x0 = x[0], x1 = x[1];
-            ulong c = x1 << 56;
-            x[0] = (x0 >> 8) ^ c ^ (c >> 1) ^ (c >> 2) ^ (c >> 7);
-            x[1] = (x1 >> 8) | (x0 << 56);
-        }
-
-        internal static void MultiplyP8(ulong[] x, ulong[] y)
-        {
-            ulong x0 = x[0], x1 = x[1];
-            ulong c = x1 << 56;
-            y[0] = (x0 >> 8) ^ c ^ (c >> 1) ^ (c >> 2) ^ (c >> 7);
-            y[1] = (x1 >> 8) | (x0 << 56);
-        }
-
-        internal static void MultiplyP8(ulong[] x, int xOff, ulong[] y, int yOff)
-        {
-            ulong x0 = x[xOff + 0], x1 = x[xOff + 1];
-            ulong c = x1 << 56;
-            y[yOff + 0] = (x0 >> 8) ^ c ^ (c >> 1) ^ (c >> 2) ^ (c >> 7);
-            y[yOff + 1] = (x1 >> 8) | (x0 << 56);
-        }
-
         internal static void MultiplyP8(ref FieldElement x)
         {
             ulong x0 = x.n0, x1 = x.n1;
@@ -385,14 +191,6 @@ namespace Org.BouncyCastle.Crypto.Modes.Gcm
             y.n1 = (x1 >> 8) | (x0 << 56);
         }
 
-        internal static void MultiplyP16(ulong[] x)
-        {
-            ulong x0 = x[0], x1 = x[1];
-            ulong c = x1 << 48;
-            x[0] = (x0 >> 16) ^ c ^ (c >> 1) ^ (c >> 2) ^ (c >> 7);
-            x[1] = (x1 >> 16) | (x0 << 48);
-        }
-
         internal static void MultiplyP16(ref FieldElement x)
         {
             ulong x0 = x.n0, x1 = x.n1;
@@ -448,19 +246,6 @@ namespace Org.BouncyCastle.Crypto.Modes.Gcm
             while (i < 16);
         }
 
-        internal static void Xor(byte[] x, int xOff, byte[] y, int yOff, byte[] z, int zOff)
-        {
-            int i = 0;
-            do
-            {
-                z[zOff + i] = (byte)(x[xOff + i] ^ y[yOff + i]); ++i;
-                z[zOff + i] = (byte)(x[xOff + i] ^ y[yOff + i]); ++i;
-                z[zOff + i] = (byte)(x[xOff + i] ^ y[yOff + i]); ++i;
-                z[zOff + i] = (byte)(x[xOff + i] ^ y[yOff + i]); ++i;
-            }
-            while (i < 16);
-        }
-
         internal static void Xor(byte[] x, byte[] y, int yOff, int yLen)
         {
             while (--yLen >= 0)
@@ -477,57 +262,10 @@ namespace Org.BouncyCastle.Crypto.Modes.Gcm
             }
         }
 
-        internal static void Xor(byte[] x, byte[] y, byte[] z)
-        {
-            int i = 0;
-            do
-            {
-                z[i] = (byte)(x[i] ^ y[i]); ++i;
-                z[i] = (byte)(x[i] ^ y[i]); ++i;
-                z[i] = (byte)(x[i] ^ y[i]); ++i;
-                z[i] = (byte)(x[i] ^ y[i]); ++i;
-            }
-            while (i < 16);
-        }
-
-        internal static void Xor(uint[] x, uint[] y)
-        {
-            x[0] ^= y[0];
-            x[1] ^= y[1];
-            x[2] ^= y[2];
-            x[3] ^= y[3];
-        }
-
-        internal static void Xor(uint[] x, uint[] y, uint[] z)
-        {
-            z[0] = x[0] ^ y[0];
-            z[1] = x[1] ^ y[1];
-            z[2] = x[2] ^ y[2];
-            z[3] = x[3] ^ y[3];
-        }
-
-        internal static void Xor(ulong[] x, ulong[] y)
-        {
-            x[0] ^= y[0];
-            x[1] ^= y[1];
-        }
-
-        internal static void Xor(ulong[] x, int xOff, ulong[] y, int yOff)
-        {
-            x[xOff + 0] ^= y[yOff + 0];
-            x[xOff + 1] ^= y[yOff + 1];
-        }
-
-        internal static void Xor(ulong[] x, ulong[] y, ulong[] z)
-        {
-            z[0] = x[0] ^ y[0];
-            z[1] = x[1] ^ y[1];
-        }
-
-        internal static void Xor(ulong[] x, int xOff, ulong[] y, int yOff, ulong[] z, int zOff)
+        internal static void Xor(ref FieldElement x, ref FieldElement y)
         {
-            z[zOff + 0] = x[xOff + 0] ^ y[yOff + 0];
-            z[zOff + 1] = x[xOff + 1] ^ y[yOff + 1];
+            x.n0 ^= y.n0;
+            x.n1 ^= y.n1;
         }
 
         internal static void Xor(ref FieldElement x, ref FieldElement y, out FieldElement z)
@@ -536,12 +274,6 @@ namespace Org.BouncyCastle.Crypto.Modes.Gcm
             z.n1 = x.n1 ^ y.n1;
         }
 
-        internal static void Xor(ref FieldElement x, ref FieldElement y)
-        {
-            x.n0 ^= y.n0;
-            x.n1 ^= y.n1;
-        }
-
         private static ulong ImplMul64(ulong x, ulong y)
         {
             ulong x0 = x & 0x1111111111111111UL;
diff --git a/crypto/src/crypto/modes/gcm/Tables4kGcmMultiplier.cs b/crypto/src/crypto/modes/gcm/Tables4kGcmMultiplier.cs
index 7867a0b99..117558b45 100644
--- a/crypto/src/crypto/modes/gcm/Tables4kGcmMultiplier.cs
+++ b/crypto/src/crypto/modes/gcm/Tables4kGcmMultiplier.cs
@@ -62,8 +62,7 @@ namespace Org.BouncyCastle.Crypto.Modes.Gcm
                 z0 = T[pos].n0 ^ (z0 >> 8) ^ c ^ (c >> 1) ^ (c >> 2) ^ (c >> 7);
             }
 
-            Pack.UInt64_To_BE(z0, x, 0);
-            Pack.UInt64_To_BE(z1, x, 8);
+            GcmUtilities.AsBytes(z0, z1, x);
         }
     }
 }
diff --git a/crypto/src/crypto/modes/gcm/Tables64kGcmMultiplier.cs b/crypto/src/crypto/modes/gcm/Tables64kGcmMultiplier.cs
index 364c070e7..52a9d4e82 100644
--- a/crypto/src/crypto/modes/gcm/Tables64kGcmMultiplier.cs
+++ b/crypto/src/crypto/modes/gcm/Tables64kGcmMultiplier.cs
@@ -74,8 +74,7 @@ namespace Org.BouncyCastle.Crypto.Modes.Gcm
                 z1 ^= t[tPos].n1;
             }
 
-            Pack.UInt64_To_BE(z0, x, 0);
-            Pack.UInt64_To_BE(z1, x, 8);
+            GcmUtilities.AsBytes(z0, z1, x);
         }
     }
 }
diff --git a/crypto/src/crypto/modes/gcm/Tables8kGcmMultiplier.cs b/crypto/src/crypto/modes/gcm/Tables8kGcmMultiplier.cs
index 67a709a75..7fe122526 100644
--- a/crypto/src/crypto/modes/gcm/Tables8kGcmMultiplier.cs
+++ b/crypto/src/crypto/modes/gcm/Tables8kGcmMultiplier.cs
@@ -82,8 +82,7 @@ namespace Org.BouncyCastle.Crypto.Modes.Gcm
                 z0 = T0[uPos].n0 ^ T1[vPos].n0 ^ (z0 >> 16) ^ c ^ (c >> 1) ^ (c >> 2) ^ (c >> 7);
             }
 
-            Pack.UInt64_To_BE(z0, x, 0);
-            Pack.UInt64_To_BE(z1, x, 8);
+            GcmUtilities.AsBytes(z0, z1, x);
         }
     }
 }