From 246d3914335e038c29fee5ac443fcf6912b470d7 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Wed, 7 Jun 2023 12:14:55 +0700 Subject: BIKE perf. opts. --- crypto/src/pqc/crypto/bike/BikeRing.cs | 122 ++++++++++++++++++++++++++------- 1 file changed, 99 insertions(+), 23 deletions(-) diff --git a/crypto/src/pqc/crypto/bike/BikeRing.cs b/crypto/src/pqc/crypto/bike/BikeRing.cs index f7833b167..1fec7b89d 100644 --- a/crypto/src/pqc/crypto/bike/BikeRing.cs +++ b/crypto/src/pqc/crypto/bike/BikeRing.cs @@ -77,14 +77,7 @@ namespace Org.BouncyCastle.Pqc.Crypto.Bike int partialBits = m_bits & 63; int partialBytes = (partialBits + 7) >> 3; Pack.LE_To_UInt64(bs, 0, z, 0, Size - 1); -#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER - Span<byte> last = stackalloc byte[8]; - bs.AsSpan((Size - 1) << 3, partialBytes).CopyTo(last); -#else - byte[] last = new byte[8]; - Array.Copy(bs, (Size - 1) << 3, last, 0, partialBytes); -#endif - z[Size - 1] = Pack.LE_To_UInt64(last); + z[Size - 1] = Pack.LE_To_UInt64_Low(bs, (Size - 1) << 3, partialBytes); Debug.Assert((z[Size - 1] >> partialBits) == 0UL); } @@ -105,19 +98,17 @@ namespace Org.BouncyCastle.Pqc.Crypto.Bike int partialBytes = (partialBits + 7) >> 3; Debug.Assert((x[Size - 1] >> partialBits) == 0UL); Pack.UInt64_To_LE(x, 0, Size - 1, bs, 0); -#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER - Span<byte> last = stackalloc byte[8]; - Pack.UInt64_To_LE(x[Size - 1], last); - last[..partialBytes].CopyTo(bs.AsSpan((Size - 1) << 3)); -#else - byte[] last = new byte[8]; - Pack.UInt64_To_LE(x[Size - 1], last); - Array.Copy(last, 0, bs, (Size - 1) << 3, partialBytes); -#endif + Pack.UInt64_To_LE_Low(x[Size - 1], bs, (Size - 1) << 3, partialBytes); } internal void Inv(ulong[] a, ulong[] z) { + /* + * Algorithm for inversion is based on https://ia.cr/2020/298 (Nir Drucker, Shay Gueron, Dusan Kostic, + * "Fast polynomial inversion for post quantum QC-MDPC cryptography"), in particular replacing large + * squarings with permutations. 
However we precompute only powers-of-half instead of full tables. + */ + ulong[] f = Create(); ulong[] g = Create(); ulong[] t = Create(); @@ -224,31 +215,116 @@ namespace Org.BouncyCastle.Pqc.Crypto.Bike #if NETCOREAPP3_0_OR_GREATER if (Pclmulqdq.IsSupported) { - int i = 0, limit = Size - 2; - while (i <= limit) + int i = 0; + + int limit4 = Size - 4; + while (i <= limit4) { - var X01 = Vector128.Create(x[xOff + i], x[xOff + i + 1]); + var X01 = Vector128.Create(x[xOff + i + 0], x[xOff + i + 1]); + var X23 = Vector128.Create(x[xOff + i + 2], x[xOff + i + 3]); int j = 0; - while (j <= limit) + while (j <= limit4) { - var Y01 = Vector128.Create(y[yOff + j], y[yOff + j + 1]); + var Y01 = Vector128.Create(y[yOff + j + 0], y[yOff + j + 1]); + var Y23 = Vector128.Create(y[yOff + j + 2], y[yOff + j + 3]); var Z01 = Pclmulqdq.CarrylessMultiply(X01, Y01, 0x00); var Z12 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y01, 0x01), Pclmulqdq.CarrylessMultiply(X01, Y01, 0x10)); var Z23 = Pclmulqdq.CarrylessMultiply(X01, Y01, 0x11); + var T23 = Pclmulqdq.CarrylessMultiply(X01, Y23, 0x00); + var T34 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y23, 0x01), + Pclmulqdq.CarrylessMultiply(X01, Y23, 0x10)); + var T45 = Pclmulqdq.CarrylessMultiply(X01, Y23, 0x11); + + var U23 = Pclmulqdq.CarrylessMultiply(X23, Y01, 0x00); + var U34 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X23, Y01, 0x01), + Pclmulqdq.CarrylessMultiply(X23, Y01, 0x10)); + var U45 = Pclmulqdq.CarrylessMultiply(X23, Y01, 0x11); + + var Z45 = Pclmulqdq.CarrylessMultiply(X23, Y23, 0x00); + var Z56 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X23, Y23, 0x01), + Pclmulqdq.CarrylessMultiply(X23, Y23, 0x10)); + var Z67 = Pclmulqdq.CarrylessMultiply(X23, Y23, 0x11); + + Z23 = Sse2.Xor(Z23, T23); + Z23 = Sse2.Xor(Z23, U23); + var Z34 = Sse2.Xor(T34, U34); + Z45 = Sse2.Xor(Z45, T45); + Z45 = Sse2.Xor(Z45, U45); + + Z01 = Sse2.Xor(Z01, Sse2.ShiftLeftLogical128BitLane (Z12, 8)); + Z23 = Sse2.Xor(Z23, Sse2.ShiftRightLogical128BitLane(Z12, 8)); + 
+ Z23 = Sse2.Xor(Z23, Sse2.ShiftLeftLogical128BitLane (Z34, 8)); + Z45 = Sse2.Xor(Z45, Sse2.ShiftRightLogical128BitLane(Z34, 8)); + + Z45 = Sse2.Xor(Z45, Sse2.ShiftLeftLogical128BitLane (Z56, 8)); + Z67 = Sse2.Xor(Z67, Sse2.ShiftRightLogical128BitLane(Z56, 8)); + + zz[i + j + 0] ^= Z01.GetElement(0); + zz[i + j + 1] ^= Z01.GetElement(1); + zz[i + j + 2] ^= Z23.GetElement(0); + zz[i + j + 3] ^= Z23.GetElement(1); + zz[i + j + 4] ^= Z45.GetElement(0); + zz[i + j + 5] ^= Z45.GetElement(1); + zz[i + j + 6] ^= Z67.GetElement(0); + zz[i + j + 7] ^= Z67.GetElement(1); + + j += 4; + } + + i += 4; + } + + int limit2 = Size - 2; + if (i <= limit2) + { + var Xi = Vector128.Create(x[xOff + i], x[xOff + i + 1]); + var Yi = Vector128.Create(y[yOff + i], y[yOff + i + 1]); + + for (int j = 0; j < i; j += 2) + { + var Xj = Vector128.Create(x[xOff + j], x[xOff + j + 1]); + var Yj = Vector128.Create(y[yOff + j], y[yOff + j + 1]); + + var U01 = Pclmulqdq.CarrylessMultiply(Xi, Yj, 0x00); + var U12 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(Xi, Yj, 0x01), + Pclmulqdq.CarrylessMultiply(Xi, Yj, 0x10)); + var U23 = Pclmulqdq.CarrylessMultiply(Xi, Yj, 0x11); + + var V01 = Pclmulqdq.CarrylessMultiply(Xj, Yi, 0x00); + var V12 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(Xj, Yi, 0x01), + Pclmulqdq.CarrylessMultiply(Xj, Yi, 0x10)); + var V23 = Pclmulqdq.CarrylessMultiply(Xj, Yi, 0x11); + + var Z01 = Sse2.Xor(U01, V01); + var Z12 = Sse2.Xor(U12, V12); + var Z23 = Sse2.Xor(U23, V23); + zz[i + j + 0] ^= Z01.GetElement(0); zz[i + j + 1] ^= Z01.GetElement(1) ^ Z12.GetElement(0); zz[i + j + 2] ^= Z23.GetElement(0) ^ Z12.GetElement(1); zz[i + j + 3] ^= Z23.GetElement(1); + } - j += 2; + { + var Z01 = Pclmulqdq.CarrylessMultiply(Xi, Yi, 0x00); + var Z12 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(Xi, Yi, 0x01), + Pclmulqdq.CarrylessMultiply(Xi, Yi, 0x10)); + var Z23 = Pclmulqdq.CarrylessMultiply(Xi, Yi, 0x11); + + zz[i + i + 0] ^= Z01.GetElement(0); + zz[i + i + 1] ^= Z01.GetElement(1) ^ Z12.GetElement(0); + zz[i + i 
+ 2] ^= Z23.GetElement(0) ^ Z12.GetElement(1); + zz[i + i + 3] ^= Z23.GetElement(1); } i += 2; } + if (i < Size) { var Xi = Vector128.CreateScalar(x[xOff + i]); -- cgit 1.4.1