From 57e97b2bec7929e204a49d0b384e04f07abf0da2 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Tue, 19 Jul 2022 00:44:58 +0700 Subject: CLMUL for GCM when available --- crypto/src/crypto/engines/AesX86Engine.cs | 2 +- crypto/src/crypto/modes/GCMBlockCipher.cs | 14 +- crypto/src/crypto/modes/gcm/BasicGcmMultiplier.cs | 3 +- crypto/src/crypto/modes/gcm/GcmUtilities.cs | 174 +++++++--------------- 4 files changed, 72 insertions(+), 121 deletions(-) diff --git a/crypto/src/crypto/engines/AesX86Engine.cs b/crypto/src/crypto/engines/AesX86Engine.cs index 038704a5f..b07e27bcf 100644 --- a/crypto/src/crypto/engines/AesX86Engine.cs +++ b/crypto/src/crypto/engines/AesX86Engine.cs @@ -14,7 +14,7 @@ namespace Org.BouncyCastle.Crypto.Engines public class AesX86Engine : IBlockCipher { - public static bool IsSupported => Aes.IsSupported && Sse2.IsSupported; + public static bool IsSupported => Aes.IsSupported; private static Vector128[] CreateRoundKeys(byte[] key, bool forEncryption) { diff --git a/crypto/src/crypto/modes/GCMBlockCipher.cs b/crypto/src/crypto/modes/GCMBlockCipher.cs index 88b413fa2..b2723e004 100644 --- a/crypto/src/crypto/modes/GCMBlockCipher.cs +++ b/crypto/src/crypto/modes/GCMBlockCipher.cs @@ -15,6 +15,18 @@ namespace Org.BouncyCastle.Crypto.Modes public class GcmBlockCipher : IAeadBlockCipher { + private static IGcmMultiplier CreateGcmMultiplier() + { +#if NET5_0_OR_GREATER + if (System.Runtime.Intrinsics.X86.Pclmulqdq.IsSupported) + { + return new BasicGcmMultiplier(); + } +#endif + + return new Tables4kGcmMultiplier(); + } + private const int BlockSize = 16; private readonly IBlockCipher cipher; @@ -59,7 +71,7 @@ namespace Org.BouncyCastle.Crypto.Modes if (m == null) { - m = new Tables4kGcmMultiplier(); + m = CreateGcmMultiplier(); } this.cipher = c; diff --git a/crypto/src/crypto/modes/gcm/BasicGcmMultiplier.cs b/crypto/src/crypto/modes/gcm/BasicGcmMultiplier.cs index c93318524..f36aaa8e4 100644 --- a/crypto/src/crypto/modes/gcm/BasicGcmMultiplier.cs +++ b/crypto/src/crypto/modes/gcm/BasicGcmMultiplier.cs @@ -14,8 +14,7 @@ namespace Org.BouncyCastle.Crypto.Modes.Gcm public void MultiplyH(byte[] x) { - GcmUtilities.FieldElement T; - GcmUtilities.AsFieldElement(x, out T); + GcmUtilities.AsFieldElement(x, out var T); GcmUtilities.Multiply(ref T, ref H); GcmUtilities.AsBytes(ref T, x); } diff --git a/crypto/src/crypto/modes/gcm/GcmUtilities.cs b/crypto/src/crypto/modes/gcm/GcmUtilities.cs index 3deed2fc1..c40d53195 100644 --- a/crypto/src/crypto/modes/gcm/GcmUtilities.cs +++ b/crypto/src/crypto/modes/gcm/GcmUtilities.cs @@ -1,4 +1,8 @@ using System; +#if NET5_0_OR_GREATER +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif using Org.BouncyCastle.Crypto.Utilities; using Org.BouncyCastle.Math.Raw; @@ -155,129 +159,65 @@ namespace Org.BouncyCastle.Crypto.Modes.Gcm internal static void Multiply(byte[] x, byte[] y) { - ulong[] t1 = AsUlongs(x); - ulong[] t2 = AsUlongs(y); - Multiply(t1, t2); - AsBytes(t1, x); + AsFieldElement(x, out FieldElement X); + AsFieldElement(y, out FieldElement Y); + Multiply(ref X, ref Y); + AsBytes(ref X, x); } - internal static void Multiply(uint[] x, uint[] y) + internal static void Multiply(ref FieldElement x, ref FieldElement y) { - uint y0 = y[0], y1 = y[1], y2 = y[2], y3 = y[3]; - uint z0 = 0, z1 = 0, z2 = 0, z3 = 0; + ulong z0, z1, z2, z3; - for (int i = 0; i < 4; ++i) +#if NET5_0_OR_GREATER + if (Pclmulqdq.IsSupported) { - int bits = (int)x[i]; - for (int j = 0; j < 32; ++j) - { - uint m1 = (uint)(bits >> 31); bits <<= 1; - z0 ^= y0 & m1; - z1 ^= y1 & m1; - z2 ^= y2 & m1; - z3 ^= y3 & m1; - - uint m2 = (uint)((int)(y3 << 31) >> 8); - y3 = (y3 >> 1) | (y2 << 31); - y2 = (y2 >> 1) | (y1 << 31); - y1 = (y1 >> 1) | (y0 << 31); - y0 = (y0 >> 1) ^ (m2 & E1); - } + var X = Vector128.Create(x.n1, x.n0); + var Y = Vector128.Create(y.n1, y.n0); + + var Z0 = Pclmulqdq.CarrylessMultiply(X, Y, 0x00); + var Z1 = Sse2.Xor( + Pclmulqdq.CarrylessMultiply(X, Y, 0x01), + Pclmulqdq.CarrylessMultiply(X, Y, 0x10)); + var Z2 = Pclmulqdq.CarrylessMultiply(X, Y, 0x11); + + ulong t3 = Z0.GetElement(0); + ulong t2 = Z0.GetElement(1) ^ Z1.GetElement(0); + ulong t1 = Z2.GetElement(0) ^ Z1.GetElement(1); + ulong t0 = Z2.GetElement(1); + + z0 = (t0 << 1) | (t1 >> 63); + z1 = (t1 << 1) | (t2 >> 63); + z2 = (t2 << 1) | (t3 >> 63); + z3 = (t3 << 1); + } + else +#endif + { + /* + * "Three-way recursion" as described in "Batch binary Edwards", Daniel J. Bernstein. + * + * Without access to the high part of a 64x64 product x * y, we use a bit reversal to calculate it: + * rev(x) * rev(y) == rev((x * y) << 1) + */ + + ulong x0 = x.n0, x1 = x.n1; + ulong y0 = y.n0, y1 = y.n1; + ulong x0r = Longs.Reverse(x0), x1r = Longs.Reverse(x1); + ulong y0r = Longs.Reverse(y0), y1r = Longs.Reverse(y1); + + ulong h0 = Longs.Reverse(ImplMul64(x0r, y0r)); + ulong h1 = ImplMul64(x0, y0) << 1; + ulong h2 = Longs.Reverse(ImplMul64(x1r, y1r)); + ulong h3 = ImplMul64(x1, y1) << 1; + ulong h4 = Longs.Reverse(ImplMul64(x0r ^ x1r, y0r ^ y1r)); + ulong h5 = ImplMul64(x0 ^ x1, y0 ^ y1) << 1; + + z0 = h0; + z1 = h1 ^ h0 ^ h2 ^ h4; + z2 = h2 ^ h1 ^ h3 ^ h5; + z3 = h3; } - - x[0] = z0; - x[1] = z1; - x[2] = z2; - x[3] = z3; - } - - internal static void Multiply(ulong[] x, ulong[] y) - { - //ulong x0 = x[0], x1 = x[1]; - //ulong y0 = y[0], y1 = y[1]; - //ulong z0 = 0, z1 = 0, z2 = 0; - - //for (int j = 0; j < 64; ++j) - //{ - // ulong m0 = (ulong)((long)x0 >> 63); x0 <<= 1; - // z0 ^= y0 & m0; - // z1 ^= y1 & m0; - - // ulong m1 = (ulong)((long)x1 >> 63); x1 <<= 1; - // z1 ^= y0 & m1; - // z2 ^= y1 & m1; - - // ulong c = (ulong)((long)(y1 << 63) >> 8); - // y1 = (y1 >> 1) | (y0 << 63); - // y0 = (y0 >> 1) ^ (c & E1UL); - //} - - //z0 ^= z2 ^ (z2 >> 1) ^ (z2 >> 2) ^ (z2 >> 7); - //z1 ^= (z2 << 63) ^ (z2 << 62) ^ (z2 << 57); - - //x[0] = z0; - //x[1] = z1; - - /* - * "Three-way recursion" as described in "Batch binary Edwards", Daniel J. Bernstein. - * - * Without access to the high part of a 64x64 product x * y, we use a bit reversal to calculate it: - * rev(x) * rev(y) == rev((x * y) << 1) - */ - - ulong x0 = x[0], x1 = x[1]; - ulong y0 = y[0], y1 = y[1]; - ulong x0r = Longs.Reverse(x0), x1r = Longs.Reverse(x1); - ulong y0r = Longs.Reverse(y0), y1r = Longs.Reverse(y1); - - ulong h0 = Longs.Reverse(ImplMul64(x0r, y0r)); - ulong h1 = ImplMul64(x0, y0) << 1; - ulong h2 = Longs.Reverse(ImplMul64(x1r, y1r)); - ulong h3 = ImplMul64(x1, y1) << 1; - ulong h4 = Longs.Reverse(ImplMul64(x0r ^ x1r, y0r ^ y1r)); - ulong h5 = ImplMul64(x0 ^ x1, y0 ^ y1) << 1; - - ulong z0 = h0; - ulong z1 = h1 ^ h0 ^ h2 ^ h4; - ulong z2 = h2 ^ h1 ^ h3 ^ h5; - ulong z3 = h3; - - z1 ^= z3 ^ (z3 >> 1) ^ (z3 >> 2) ^ (z3 >> 7); -// z2 ^= (z3 << 63) ^ (z3 << 62) ^ (z3 << 57); - z2 ^= (z3 << 62) ^ (z3 << 57); - - z0 ^= z2 ^ (z2 >> 1) ^ (z2 >> 2) ^ (z2 >> 7); - z1 ^= (z2 << 63) ^ (z2 << 62) ^ (z2 << 57); - - x[0] = z0; - x[1] = z1; - } - - internal static void Multiply(ref FieldElement x, ref FieldElement y) - { - /* - * "Three-way recursion" as described in "Batch binary Edwards", Daniel J. Bernstein. - * - * Without access to the high part of a 64x64 product x * y, we use a bit reversal to calculate it: - * rev(x) * rev(y) == rev((x * y) << 1) - */ - - ulong x0 = x.n0, x1 = x.n1; - ulong y0 = y.n0, y1 = y.n1; - ulong x0r = Longs.Reverse(x0), x1r = Longs.Reverse(x1); - ulong y0r = Longs.Reverse(y0), y1r = Longs.Reverse(y1); - - ulong h0 = Longs.Reverse(ImplMul64(x0r, y0r)); - ulong h1 = ImplMul64(x0, y0) << 1; - ulong h2 = Longs.Reverse(ImplMul64(x1r, y1r)); - ulong h3 = ImplMul64(x1, y1) << 1; - ulong h4 = Longs.Reverse(ImplMul64(x0r ^ x1r, y0r ^ y1r)); - ulong h5 = ImplMul64(x0 ^ x1, y0 ^ y1) << 1; - - ulong z0 = h0; - ulong z1 = h1 ^ h0 ^ h2 ^ h4; - ulong z2 = h2 ^ h1 ^ h3 ^ h5; - ulong z3 = h3; z1 ^= z3 ^ (z3 >> 1) ^ (z3 >> 2) ^ (z3 >> 7); // z2 ^= (z3 << 63) ^ (z3 << 62) ^ (z3 << 57); -- cgit 1.4.1