From 072f410f8c06ed0b7482f259197b13bf9bcdf649 Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sun, 13 Nov 2022 15:20:06 +0700 Subject: X448 perf. opts. --- crypto/src/math/ec/rfc7748/X448Field.cs | 181 ++++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) diff --git a/crypto/src/math/ec/rfc7748/X448Field.cs b/crypto/src/math/ec/rfc7748/X448Field.cs index 1e1d379fe..67e71afa8 100644 --- a/crypto/src/math/ec/rfc7748/X448Field.cs +++ b/crypto/src/math/ec/rfc7748/X448Field.cs @@ -1,5 +1,16 @@ using System; using System.Diagnostics; +#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER +using System.Runtime.CompilerServices; +#endif +#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER +using System.Numerics; +#endif +#if NETCOREAPP3_0_OR_GREATER +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif using Org.BouncyCastle.Math.Raw; @@ -16,14 +27,47 @@ namespace Org.BouncyCastle.Math.EC.Rfc7748 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFEU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU }; +#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER + [MethodImpl(MethodImplOptions.AggressiveInlining)] +#endif public static void Add(uint[] x, uint[] y, uint[] z) { +#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER + Add(x.AsSpan(), y.AsSpan(), z.AsSpan()); +#else for (int i = 0; i < Size; ++i) { z[i] = x[i] + y[i]; } +#endif } +#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void Add(ReadOnlySpan x, ReadOnlySpan y, Span z) + { + int i = 0; + if (Vector.IsHardwareAccelerated) + { + int limit = Size - Vector.Count; + while (i <= limit) + { + var vx = new Vector(x[i..]); + var vy = new Vector(y[i..]); + (vx + vy).CopyTo(z[i..]); + i += Vector.Count; + } + } + { + while (i < Size) + { + z[i] = x[i] + y[i]; + ++i; + } + } + } +#endif + public static void AddOne(uint[] z) { z[0] += 1; @@ -140,10 +184,14 @@ namespace Org.BouncyCastle.Math.EC.Rfc7748 public static void Copy(uint[] x, int xOff, uint[] z, int zOff) { +#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER + Copy(x.AsSpan(xOff), z.AsSpan(zOff)); +#else for (int i = 0; i < Size; ++i) { z[zOff + i] = x[xOff + i]; } +#endif } #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER @@ -1289,6 +1337,9 @@ namespace Org.BouncyCastle.Math.EC.Rfc7748 public static void Sub(uint[] x, uint[] y, uint[] z) { +#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER + Sub(x.AsSpan(), y.AsSpan(), z.AsSpan()); +#else uint x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7]; uint x8 = x[8], x9 = x[9], x10 = x[10], x11 = x[11], x12 = x[12], x13 = x[13], x14 = x[14], x15 = x[15]; uint y0 = y[0], y1 = y[1], y2 = y[2], y3 = y[3], y4 = y[4], y5 = y[5], y6 = y[6], y7 = y[7]; @@ -1350,8 +1401,138 @@ namespace Org.BouncyCastle.Math.EC.Rfc7748 z[13] = z13; z[14] = z14; z[15] = z15; +#endif } +#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER + public static void Sub(ReadOnlySpan x, ReadOnlySpan y, Span z) + { +#if NETCOREAPP3_0_OR_GREATER + if (Avx2.IsSupported && BitConverter.IsLittleEndian && Unsafe.SizeOf>() == 32) + { + var ControlCarry = Vector256.Create(7U, 0U, 1U, 2U, 3U, 4U, 5U, 6U); + var Mask28 = Vector256.Create(M28); + var MaskScalar = Vector256.CreateScalar(uint.MaxValue); + + var P0 = Vector256.Create(0x1FFFFFFEU); + var P1 = P0.WithElement(0, 0x1FFFFFFCU); + + var X = MemoryMarshal.AsBytes(x[..16]); + var Y = MemoryMarshal.AsBytes(y[..16]); + var Z = MemoryMarshal.AsBytes(z[..16]); + + var X0 = MemoryMarshal.Read>(X[0x00..0x20]); + var X1 = MemoryMarshal.Read>(X[0x20..0x40]); + + var Y0 = MemoryMarshal.Read>(Y[0x00..0x20]); + var Y1 = MemoryMarshal.Read>(Y[0x20..0x40]); + + var Z0 = Avx2.Subtract(Avx2.Add(X0, P0), Y0); + var Z1 = Avx2.Subtract(Avx2.Add(X1, P1), Y1); + + // First carry + { + var C0 = Avx2.ShiftRightLogical(Z0, 28); + var C1 = Avx2.ShiftRightLogical(Z1, 28); + + Z0 = Avx2.And(Z0, Mask28); + Z1 = Avx2.And(Z1, Mask28); + + C0 = Avx2.PermuteVar8x32(C0, ControlCarry); + C1 = Avx2.PermuteVar8x32(C1, ControlCarry); + + Z0 = Avx2.Add(Z0, Avx2.Blend(C0, C1, 0x01)); + Z1 = Avx2.Add(Z1, C1); + Z1 = Avx2.Add(Z1, Avx2.And(C0, MaskScalar)); + } + + // Second carry + { + var C0 = Avx2.ShiftRightLogical(Z0, 28); + var C1 = Avx2.ShiftRightLogical(Z1, 28); + + Z0 = Avx2.And(Z0, Mask28); + Z1 = Avx2.And(Z1, Mask28); + + C0 = Avx2.PermuteVar8x32(C0, ControlCarry); + C1 = Avx2.PermuteVar8x32(C1, ControlCarry); + + Z0 = Avx2.Add(Z0, Avx2.Blend(C0, C1, 0x01)); + Z1 = Avx2.Add(Z1, C1); + Z1 = Avx2.Add(Z1, Avx2.And(C0, MaskScalar)); + } + + MemoryMarshal.Write(Z[0x00..0x20], ref Z0); + MemoryMarshal.Write(Z[0x20..0x40], ref Z1); + + return; + } +#endif + + uint x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7]; + uint x8 = x[8], x9 = x[9], x10 = x[10], x11 = x[11], x12 = x[12], x13 = x[13], x14 = x[14], x15 = x[15]; + uint y0 = y[0], y1 = y[1], y2 = y[2], y3 = y[3], y4 = y[4], y5 = y[5], y6 = y[6], y7 = y[7]; + uint y8 = y[8], y9 = y[9], y10 = y[10], y11 = y[11], y12 = y[12], y13 = y[13], y14 = y[14], y15 = y[15]; + + uint z0 = x0 + 0x1FFFFFFEU - y0; + uint z1 = x1 + 0x1FFFFFFEU - y1; + uint z2 = x2 + 0x1FFFFFFEU - y2; + uint z3 = x3 + 0x1FFFFFFEU - y3; + uint z4 = x4 + 0x1FFFFFFEU - y4; + uint z5 = x5 + 0x1FFFFFFEU - y5; + uint z6 = x6 + 0x1FFFFFFEU - y6; + uint z7 = x7 + 0x1FFFFFFEU - y7; + uint z8 = x8 + 0x1FFFFFFCU - y8; + uint z9 = x9 + 0x1FFFFFFEU - y9; + uint z10 = x10 + 0x1FFFFFFEU - y10; + uint z11 = x11 + 0x1FFFFFFEU - y11; + uint z12 = x12 + 0x1FFFFFFEU - y12; + uint z13 = x13 + 0x1FFFFFFEU - y13; + uint z14 = x14 + 0x1FFFFFFEU - y14; + uint z15 = x15 + 0x1FFFFFFEU - y15; + + z2 += z1 >> 28; z1 &= M28; + z6 += z5 >> 28; z5 &= M28; + z10 += z9 >> 28; z9 &= M28; + z14 += z13 >> 28; z13 &= M28; + + z3 += z2 >> 28; z2 &= M28; + z7 += z6 >> 28; z6 &= M28; + z11 += z10 >> 28; z10 &= M28; + z15 += z14 >> 28; z14 &= M28; + + uint t = z15 >> 28; z15 &= M28; + z0 += t; + z8 += t; + + z4 += z3 >> 28; z3 &= M28; + z8 += z7 >> 28; z7 &= M28; + z12 += z11 >> 28; z11 &= M28; + + z1 += z0 >> 28; z0 &= M28; + z5 += z4 >> 28; z4 &= M28; + z9 += z8 >> 28; z8 &= M28; + z13 += z12 >> 28; z12 &= M28; + + z[0] = z0; + z[1] = z1; + z[2] = z2; + z[3] = z3; + z[4] = z4; + z[5] = z5; + z[6] = z6; + z[7] = z7; + z[8] = z8; + z[9] = z9; + z[10] = z10; + z[11] = z11; + z[12] = z12; + z[13] = z13; + z[14] = z14; + z[15] = z15; + } +#endif + public static void SubOne(uint[] z) { uint[] one = Create(); -- cgit 1.4.1