diff options
-rw-r--r-- | crypto/src/math/ec/rfc8032/Ed25519.cs | 14 | ||||
-rw-r--r-- | crypto/src/math/ec/rfc8032/Ed448.cs | 14 | ||||
-rw-r--r-- | crypto/src/math/ec/rfc8032/Scalar25519.cs | 309 | ||||
-rw-r--r-- | crypto/src/math/ec/rfc8032/Scalar448.cs | 439 |
4 files changed, 704 insertions, 72 deletions
diff --git a/crypto/src/math/ec/rfc8032/Ed25519.cs b/crypto/src/math/ec/rfc8032/Ed25519.cs index 766ccb393..8add3c48b 100644 --- a/crypto/src/math/ec/rfc8032/Ed25519.cs +++ b/crypto/src/math/ec/rfc8032/Ed25519.cs @@ -149,7 +149,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 byte[] result = new byte[ScalarBytes * 2]; Codec.Encode32(t, 0, t.Length, result, 0); - return Scalar25519.Reduce(result); + return Scalar25519.Reduce512(result); } private static bool CheckContextVar(byte[] ctx, byte phflag) @@ -592,7 +592,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 d.BlockUpdate(m, mOff, mLen); d.DoFinal(h, 0); - byte[] r = Scalar25519.Reduce(h); + byte[] r = Scalar25519.Reduce512(h); byte[] R = new byte[PointBytes]; ScalarMultBaseEncoded(r, R, 0); @@ -605,7 +605,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 d.BlockUpdate(m, mOff, mLen); d.DoFinal(h, 0); - byte[] k = Scalar25519.Reduce(h); + byte[] k = Scalar25519.Reduce512(h); byte[] S = CalculateS(r, k, s); Array.Copy(R, 0, sig, sigOff, PointBytes); @@ -697,7 +697,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 d.DoFinal(h); Span<byte> k = stackalloc byte[ScalarBytes]; - Scalar25519.Reduce(h, k); + Scalar25519.Reduce512(h, k); Span<uint> nA = stackalloc uint[ScalarUints]; Scalar25519.Decode(k, nA); @@ -739,7 +739,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 d.BlockUpdate(m, mOff, mLen); d.DoFinal(h, 0); - byte[] k = Scalar25519.Reduce(h); + byte[] k = Scalar25519.Reduce512(h); uint[] nA = new uint[ScalarUints]; Scalar25519.Decode(k, nA); @@ -799,7 +799,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 d.DoFinal(h); Span<byte> k = stackalloc byte[ScalarBytes]; - Scalar25519.Reduce(h, k); + Scalar25519.Reduce512(h, k); Span<uint> nA = stackalloc uint[ScalarUints]; Scalar25519.Decode(k, nA); @@ -840,7 +840,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 d.BlockUpdate(m, mOff, mLen); d.DoFinal(h, 0); - byte[] k = Scalar25519.Reduce(h); + byte[] k = Scalar25519.Reduce512(h); uint[] nA = new uint[ScalarUints]; Scalar25519.Decode(k, nA); diff --git a/crypto/src/math/ec/rfc8032/Ed448.cs b/crypto/src/math/ec/rfc8032/Ed448.cs index bde1461e6..6eee639fe 100644 --- a/crypto/src/math/ec/rfc8032/Ed448.cs +++ b/crypto/src/math/ec/rfc8032/Ed448.cs @@ -118,7 +118,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 byte[] result = new byte[ScalarBytes * 2]; Codec.Encode32(t, 0, t.Length, result, 0); - return Scalar448.Reduce(result); + return Scalar448.Reduce912(result); } private static bool CheckContextVar(byte[] ctx) @@ -545,7 +545,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 d.BlockUpdate(m, mOff, mLen); d.OutputFinal(h, 0, h.Length); - byte[] r = Scalar448.Reduce(h); + byte[] r = Scalar448.Reduce912(h); byte[] R = new byte[PointBytes]; ScalarMultBaseEncoded(r, R, 0); @@ -555,7 +555,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 d.BlockUpdate(m, mOff, mLen); d.OutputFinal(h, 0, h.Length); - byte[] k = Scalar448.Reduce(h); + byte[] k = Scalar448.Reduce912(h); byte[] S = CalculateS(r, k, s); Array.Copy(R, 0, sig, sigOff, PointBytes); @@ -644,7 +644,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 d.OutputFinal(h); Span<byte> k = stackalloc byte[ScalarBytes]; - Scalar448.Reduce(h, k); + Scalar448.Reduce912(h, k); Span<uint> nA = stackalloc uint[ScalarUints]; Scalar448.Decode(k, nA); @@ -683,7 +683,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 d.BlockUpdate(m, mOff, mLen); d.OutputFinal(h, 0, h.Length); - byte[] k = Scalar448.Reduce(h); + byte[] k = Scalar448.Reduce912(h); uint[] nA = new uint[ScalarUints]; Scalar448.Decode(k, nA); @@ -740,7 +740,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 d.OutputFinal(h); Span<byte> k = stackalloc byte[ScalarBytes]; - Scalar448.Reduce(h, k); + Scalar448.Reduce912(h, k); Span<uint> nA = stackalloc uint[ScalarUints]; Scalar448.Decode(k, nA); @@ -778,7 +778,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 d.BlockUpdate(m, mOff, mLen); d.OutputFinal(h, 0, h.Length); - byte[] k = Scalar448.Reduce(h); + byte[] k = Scalar448.Reduce912(h); uint[] nA = new uint[ScalarUints]; Scalar448.Decode(k, nA); diff --git a/crypto/src/math/ec/rfc8032/Scalar25519.cs b/crypto/src/math/ec/rfc8032/Scalar25519.cs index df31929cd..67eee6155 100644 --- a/crypto/src/math/ec/rfc8032/Scalar25519.cs +++ b/crypto/src/math/ec/rfc8032/Scalar25519.cs @@ -16,9 +16,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 private const int ScalarBytes = Size * 4; - private const long M08L = 0x000000FFL; private const long M28L = 0x0FFFFFFFL; - private const long M32L = 0xFFFFFFFFL; private const int TargetLength = 254; @@ -72,7 +70,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER internal static void Multiply128Var(ReadOnlySpan<uint> x, ReadOnlySpan<uint> y128, Span<uint> z) { - Span<uint> tt = stackalloc uint[16]; + Span<uint> tt = stackalloc uint[12]; Nat256.Mul128(x, y128, tt); if ((int)y128[3] < 0) @@ -81,9 +79,20 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 Nat256.SubFrom(x, tt[4..], 0); } - Span<byte> r = MemoryMarshal.AsBytes(tt); - Reduce(r, r); - tt[..Size].CopyTo(z); + if (BitConverter.IsLittleEndian) + { + Span<byte> r = MemoryMarshal.AsBytes(tt); + Reduce384(r, r); + tt[..Size].CopyTo(z); + } + else + { + Span<byte> r = stackalloc byte[48]; + Codec.Encode32(tt, r); + + Reduce384(r, r); + Decode(r, z); + } } #else internal static void Multiply128Var(uint[] x, uint[] y128, uint[] z) @@ -97,40 +106,242 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 Nat256.SubFrom(x, 0, tt, 4, 0); } - byte[] bytes = new byte[64]; + byte[] bytes = new byte[48]; Codec.Encode32(tt, 0, 12, bytes, 0); - byte[] r = Reduce(bytes); + byte[] r = Reduce384(bytes); Decode(r, z); } #endif - internal static byte[] Reduce(byte[] n) + internal static byte[] Reduce384(byte[] n) + { + byte[] r = new byte[ScalarBytes]; + +#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER + Reduce384(n, r); +#else + long x00 = Codec.Decode32(n, 0); // x00:32/-- + long x01 = (Codec.Decode24(n, 4) << 4); // x01:28/-- + long x02 = Codec.Decode32(n, 7); // x02:32/-- + long x03 = (Codec.Decode24(n, 11) << 4); // x03:28/-- + long x04 = Codec.Decode32(n, 14); // x04:32/-- + long x05 = (Codec.Decode24(n, 18) << 4); // x05:28/-- + long x06 = Codec.Decode32(n, 21); // x06:32/-- + long x07 = (Codec.Decode24(n, 25) << 4); // x07:28/-- + long x08 = Codec.Decode32(n, 28); // x08:32/-- + long x09 = (Codec.Decode24(n, 32) << 4); // x09:28/-- + long x10 = Codec.Decode32(n, 35); // x10:32/-- + long x11 = (Codec.Decode24(n, 39) << 4); // x11:28/-- + long x12 = Codec.Decode32(n, 42); // x12:32/-- + long x13 = (Codec.Decode16(n, 46) << 4); // x13:20/-- + long t; + + // TODO Fix bounds calculations which were copied from Reduce512 + + x13 += (x12 >> 28); x12 &= M28L; // x13:28/22, x12:28/-- + x04 -= x13 * L0; // x04:54/49 + x05 -= x13 * L1; // x05:54/53 + x06 -= x13 * L2; // x06:56/-- + x07 -= x13 * L3; // x07:56/52 + x08 -= x13 * L4; // x08:56/52 + + x12 += (x11 >> 28); x11 &= M28L; // x12:28/24, x11:28/-- + x03 -= x12 * L0; // x03:54/49 + x04 -= x12 * L1; // x04:54/51 + x05 -= x12 * L2; // x05:56/-- + x06 -= x12 * L3; // x06:56/52 + x07 -= x12 * L4; // x07:56/53 + + x11 += (x10 >> 28); x10 &= M28L; // x11:29/--, x10:28/-- + x02 -= x11 * L0; // x02:55/32 + x03 -= x11 * L1; // x03:55/-- + x04 -= x11 * L2; // x04:56/55 + x05 -= x11 * L3; // x05:56/52 + x06 -= x11 * L4; // x06:56/53 + + x10 += (x09 >> 28); x09 &= M28L; // x10:29/--, x09:28/-- + x01 -= x10 * L0; // x01:55/28 + x02 -= x10 * L1; // x02:55/54 + x03 -= x10 * L2; // x03:56/55 + x04 -= x10 * L3; // x04:57/-- + x05 -= x10 * L4; // x05:56/53 + + x08 += (x07 >> 28); x07 &= M28L; // x08:56/53, x07:28/-- + x09 += (x08 >> 28); x08 &= M28L; // x09:29/25, x08:28/-- + + t = (x08 >> 27) & 1L; + x09 += t; // x09:29/26 + + x00 -= x09 * L0; // x00:55/53 + x01 -= x09 * L1; // x01:55/54 + x02 -= x09 * L2; // x02:57/-- + x03 -= x09 * L3; // x03:57/-- + x04 -= x09 * L4; // x04:57/42 + + x01 += (x00 >> 28); x00 &= M28L; + x02 += (x01 >> 28); x01 &= M28L; + x03 += (x02 >> 28); x02 &= M28L; + x04 += (x03 >> 28); x03 &= M28L; + x05 += (x04 >> 28); x04 &= M28L; + x06 += (x05 >> 28); x05 &= M28L; + x07 += (x06 >> 28); x06 &= M28L; + x08 += (x07 >> 28); x07 &= M28L; + x09 = (x08 >> 28); x08 &= M28L; + + x09 -= t; + + Debug.Assert(x09 == 0L || x09 == -1L); + + x00 += x09 & L0; + x01 += x09 & L1; + x02 += x09 & L2; + x03 += x09 & L3; + x04 += x09 & L4; + + x01 += (x00 >> 28); x00 &= M28L; + x02 += (x01 >> 28); x01 &= M28L; + x03 += (x02 >> 28); x02 &= M28L; + x04 += (x03 >> 28); x03 &= M28L; + x05 += (x04 >> 28); x04 &= M28L; + x06 += (x05 >> 28); x05 &= M28L; + x07 += (x06 >> 28); x06 &= M28L; + x08 += (x07 >> 28); x07 &= M28L; + + Codec.Encode56((ulong)(x00 | (x01 << 28)), r, 0); + Codec.Encode56((ulong)(x02 | (x03 << 28)), r, 7); + Codec.Encode56((ulong)(x04 | (x05 << 28)), r, 14); + Codec.Encode56((ulong)(x06 | (x07 << 28)), r, 21); + Codec.Encode32((uint)x08, r, 28); +#endif + + return r; + } + +#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER + internal static void Reduce384(ReadOnlySpan<byte> n, Span<byte> r) + { + long x00 = Codec.Decode32(n[ 0..]); // x00:32/-- + long x01 = (Codec.Decode24(n[ 4..]) << 4); // x01:28/-- + long x02 = Codec.Decode32(n[ 7..]); // x02:32/-- + long x03 = (Codec.Decode24(n[11..]) << 4); // x03:28/-- + long x04 = Codec.Decode32(n[14..]); // x04:32/-- + long x05 = (Codec.Decode24(n[18..]) << 4); // x05:28/-- + long x06 = Codec.Decode32(n[21..]); // x06:32/-- + long x07 = (Codec.Decode24(n[25..]) << 4); // x07:28/-- + long x08 = Codec.Decode32(n[28..]); // x08:32/-- + long x09 = (Codec.Decode24(n[32..]) << 4); // x09:28/-- + long x10 = Codec.Decode32(n[35..]); // x10:32/-- + long x11 = (Codec.Decode24(n[39..]) << 4); // x11:28/-- + long x12 = Codec.Decode32(n[42..]); // x12:32/-- + long x13 = (Codec.Decode16(n[46..]) << 4); // x13:20/-- + long t; + + // TODO Fix bounds calculations which were copied from Reduce512 + + x13 += (x12 >> 28); x12 &= M28L; // x13:28/22, x12:28/-- + x04 -= x13 * L0; // x04:54/49 + x05 -= x13 * L1; // x05:54/53 + x06 -= x13 * L2; // x06:56/-- + x07 -= x13 * L3; // x07:56/52 + x08 -= x13 * L4; // x08:56/52 + + x12 += (x11 >> 28); x11 &= M28L; // x12:28/24, x11:28/-- + x03 -= x12 * L0; // x03:54/49 + x04 -= x12 * L1; // x04:54/51 + x05 -= x12 * L2; // x05:56/-- + x06 -= x12 * L3; // x06:56/52 + x07 -= x12 * L4; // x07:56/53 + + x11 += (x10 >> 28); x10 &= M28L; // x11:29/--, x10:28/-- + x02 -= x11 * L0; // x02:55/32 + x03 -= x11 * L1; // x03:55/-- + x04 -= x11 * L2; // x04:56/55 + x05 -= x11 * L3; // x05:56/52 + x06 -= x11 * L4; // x06:56/53 + + x10 += (x09 >> 28); x09 &= M28L; // x10:29/--, x09:28/-- + x01 -= x10 * L0; // x01:55/28 + x02 -= x10 * L1; // x02:55/54 + x03 -= x10 * L2; // x03:56/55 + x04 -= x10 * L3; // x04:57/-- + x05 -= x10 * L4; // x05:56/53 + + x08 += (x07 >> 28); x07 &= M28L; // x08:56/53, x07:28/-- + x09 += (x08 >> 28); x08 &= M28L; // x09:29/25, x08:28/-- + + t = (x08 >> 27) & 1L; + x09 += t; // x09:29/26 + + x00 -= x09 * L0; // x00:55/53 + x01 -= x09 * L1; // x01:55/54 + x02 -= x09 * L2; // x02:57/-- + x03 -= x09 * L3; // x03:57/-- + x04 -= x09 * L4; // x04:57/42 + + x01 += (x00 >> 28); x00 &= M28L; + x02 += (x01 >> 28); x01 &= M28L; + x03 += (x02 >> 28); x02 &= M28L; + x04 += (x03 >> 28); x03 &= M28L; + x05 += (x04 >> 28); x04 &= M28L; + x06 += (x05 >> 28); x05 &= M28L; + x07 += (x06 >> 28); x06 &= M28L; + x08 += (x07 >> 28); x07 &= M28L; + x09 = (x08 >> 28); x08 &= M28L; + + x09 -= t; + + Debug.Assert(x09 == 0L || x09 == -1L); + + x00 += x09 & L0; + x01 += x09 & L1; + x02 += x09 & L2; + x03 += x09 & L3; + x04 += x09 & L4; + + x01 += (x00 >> 28); x00 &= M28L; + x02 += (x01 >> 28); x01 &= M28L; + x03 += (x02 >> 28); x02 &= M28L; + x04 += (x03 >> 28); x03 &= M28L; + x05 += (x04 >> 28); x04 &= M28L; + x06 += (x05 >> 28); x05 &= M28L; + x07 += (x06 >> 28); x06 &= M28L; + x08 += (x07 >> 28); x07 &= M28L; + + Codec.Encode56((ulong)(x00 | (x01 << 28)), r); + Codec.Encode56((ulong)(x02 | (x03 << 28)), r[7..]); + Codec.Encode56((ulong)(x04 | (x05 << 28)), r[14..]); + Codec.Encode56((ulong)(x06 | (x07 << 28)), r[21..]); + Codec.Encode32((uint)x08, r[28..]); + } +#endif + + internal static byte[] Reduce512(byte[] n) { byte[] r = new byte[ScalarBytes]; #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER - Reduce(n, r); + Reduce512(n, r); #else - long x00 = Codec.Decode32(n, 0) & M32L; // x00:32/-- - long x01 = (Codec.Decode24(n, 4) << 4) & M32L; // x01:28/-- - long x02 = Codec.Decode32(n, 7) & M32L; // x02:32/-- - long x03 = (Codec.Decode24(n, 11) << 4) & M32L; // x03:28/-- - long x04 = Codec.Decode32(n, 14) & M32L; // x04:32/-- - long x05 = (Codec.Decode24(n, 18) << 4) & M32L; // x05:28/-- - long x06 = Codec.Decode32(n, 21) & M32L; // x06:32/-- - long x07 = (Codec.Decode24(n, 25) << 4) & M32L; // x07:28/-- - long x08 = Codec.Decode32(n, 28) & M32L; // x08:32/-- - long x09 = (Codec.Decode24(n, 32) << 4) & M32L; // x09:28/-- - long x10 = Codec.Decode32(n, 35) & M32L; // x10:32/-- - long x11 = (Codec.Decode24(n, 39) << 4) & M32L; // x11:28/-- - long x12 = Codec.Decode32(n, 42) & M32L; // x12:32/-- - long x13 = (Codec.Decode24(n, 46) << 4) & M32L; // x13:28/-- - long x14 = Codec.Decode32(n, 49) & M32L; // x14:32/-- - long x15 = (Codec.Decode24(n, 53) << 4) & M32L; // x15:28/-- - long x16 = Codec.Decode32(n, 56) & M32L; // x16:32/-- - long x17 = (Codec.Decode24(n, 60) << 4) & M32L; // x17:28/-- - long x18 = n[63] & M08L; // x18:08/-- + long x00 = Codec.Decode32(n, 0); // x00:32/-- + long x01 = (Codec.Decode24(n, 4) << 4); // x01:28/-- + long x02 = Codec.Decode32(n, 7); // x02:32/-- + long x03 = (Codec.Decode24(n, 11) << 4); // x03:28/-- + long x04 = Codec.Decode32(n, 14); // x04:32/-- + long x05 = (Codec.Decode24(n, 18) << 4); // x05:28/-- + long x06 = Codec.Decode32(n, 21); // x06:32/-- + long x07 = (Codec.Decode24(n, 25) << 4); // x07:28/-- + long x08 = Codec.Decode32(n, 28); // x08:32/-- + long x09 = (Codec.Decode24(n, 32) << 4); // x09:28/-- + long x10 = Codec.Decode32(n, 35); // x10:32/-- + long x11 = (Codec.Decode24(n, 39) << 4); // x11:28/-- + long x12 = Codec.Decode32(n, 42); // x12:32/-- + long x13 = (Codec.Decode24(n, 46) << 4); // x13:28/-- + long x14 = Codec.Decode32(n, 49); // x14:32/-- + long x15 = (Codec.Decode24(n, 53) << 4); // x15:28/-- + long x16 = Codec.Decode32(n, 56); // x16:32/-- + long x17 = (Codec.Decode24(n, 60) << 4); // x17:28/-- + long x18 = n[63]; // x18:08/-- long t; //x18 += (x17 >> 28); x17 &= M28L; @@ -248,27 +459,27 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 } #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER - internal static void Reduce(ReadOnlySpan<byte> n, Span<byte> r) + internal static void Reduce512(ReadOnlySpan<byte> n, Span<byte> r) { - long x00 = Codec.Decode32(n[ 0..]) & M32L; // x00:32/-- - long x01 = (Codec.Decode24(n[ 4..]) << 4) & M32L; // x01:28/-- - long x02 = Codec.Decode32(n[ 7..]) & M32L; // x02:32/-- - long x03 = (Codec.Decode24(n[11..]) << 4) & M32L; // x03:28/-- - long x04 = Codec.Decode32(n[14..]) & M32L; // x04:32/-- - long x05 = (Codec.Decode24(n[18..]) << 4) & M32L; // x05:28/-- - long x06 = Codec.Decode32(n[21..]) & M32L; // x06:32/-- - long x07 = (Codec.Decode24(n[25..]) << 4) & M32L; // x07:28/-- - long x08 = Codec.Decode32(n[28..]) & M32L; // x08:32/-- - long x09 = (Codec.Decode24(n[32..]) << 4) & M32L; // x09:28/-- - long x10 = Codec.Decode32(n[35..]) & M32L; // x10:32/-- - long x11 = (Codec.Decode24(n[39..]) << 4) & M32L; // x11:28/-- - long x12 = Codec.Decode32(n[42..]) & M32L; // x12:32/-- - long x13 = (Codec.Decode24(n[46..]) << 4) & M32L; // x13:28/-- - long x14 = Codec.Decode32(n[49..]) & M32L; // x14:32/-- - long x15 = (Codec.Decode24(n[53..]) << 4) & M32L; // x15:28/-- - long x16 = Codec.Decode32(n[56..]) & M32L; // x16:32/-- - long x17 = (Codec.Decode24(n[60..]) << 4) & M32L; // x17:28/-- - long x18 = n[63] & M08L; // x18:08/-- + long x00 = Codec.Decode32(n[ 0..]); // x00:32/-- + long x01 = (Codec.Decode24(n[ 4..]) << 4); // x01:28/-- + long x02 = Codec.Decode32(n[ 7..]); // x02:32/-- + long x03 = (Codec.Decode24(n[11..]) << 4); // x03:28/-- + long x04 = Codec.Decode32(n[14..]); // x04:32/-- + long x05 = (Codec.Decode24(n[18..]) << 4); // x05:28/-- + long x06 = Codec.Decode32(n[21..]); // x06:32/-- + long x07 = (Codec.Decode24(n[25..]) << 4); // x07:28/-- + long x08 = Codec.Decode32(n[28..]); // x08:32/-- + long x09 = (Codec.Decode24(n[32..]) << 4); // x09:28/-- + long x10 = Codec.Decode32(n[35..]); // x10:32/-- + long x11 = (Codec.Decode24(n[39..]) << 4); // x11:28/-- + long x12 = Codec.Decode32(n[42..]); // x12:32/-- + long x13 = (Codec.Decode24(n[46..]) << 4); // x13:28/-- + long x14 = Codec.Decode32(n[49..]); // x14:32/-- + long x15 = (Codec.Decode24(n[53..]) << 4); // x15:28/-- + long x16 = Codec.Decode32(n[56..]); // x16:32/-- + long x17 = (Codec.Decode24(n[60..]) << 4); // x17:28/-- + long x18 = n[63]; // x18:08/-- long t; //x18 += (x17 >> 28); x17 &= M28L; diff --git a/crypto/src/math/ec/rfc8032/Scalar448.cs b/crypto/src/math/ec/rfc8032/Scalar448.cs index 4afe1d2d6..124b91250 100644 --- a/crypto/src/math/ec/rfc8032/Scalar448.cs +++ b/crypto/src/math/ec/rfc8032/Scalar448.cs @@ -97,7 +97,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 { Debug.Assert((int)y225[7] >> 31 == (int)y225[7]); - Span<uint> tt = stackalloc uint[29]; + Span<uint> tt = stackalloc uint[22]; Nat.Mul(y225, x, tt); if ((int)y225[7] < 0) @@ -106,9 +106,20 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 Nat.SubFrom(Size, x, tt[8..]); } - Span<byte> r = MemoryMarshal.AsBytes(tt); - Reduce(r, r); - tt[..Size].CopyTo(z); + if (BitConverter.IsLittleEndian) + { + Span<byte> r = MemoryMarshal.AsBytes(tt); + Reduce704(r, r); + tt[..Size].CopyTo(z); + } + else + { + Span<byte> r = stackalloc byte[88]; + Codec.Encode32(tt, r); + + Reduce704(r, r); + Decode(r, z); + } } #else internal static void Multiply225Var(uint[] x, uint[] y225, uint[] z) @@ -124,20 +135,430 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 Nat.SubFrom(Size, x, 0, tt, 8); } - byte[] bytes = new byte[114]; + byte[] bytes = new byte[88]; Codec.Encode32(tt, 0, 22, bytes, 0); - byte[] r = Reduce(bytes); + byte[] r = Reduce704(bytes); Decode(r, z); } #endif - internal static byte[] Reduce(byte[] n) + internal static byte[] Reduce704(byte[] n) + { + byte[] r = new byte[ScalarBytes]; + +#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER + Reduce704(n, r); +#else + ulong x00 = Codec.Decode32(n, 0); // x00:32/-- + ulong x01 = (Codec.Decode24(n, 4) << 4); // x01:28/-- + ulong x02 = Codec.Decode32(n, 7); // x02:32/-- + ulong x03 = (Codec.Decode24(n, 11) << 4); // x03:28/-- + ulong x04 = Codec.Decode32(n, 14); // x04:32/-- + ulong x05 = (Codec.Decode24(n, 18) << 4); // x05:28/-- + ulong x06 = Codec.Decode32(n, 21); // x06:32/-- + ulong x07 = (Codec.Decode24(n, 25) << 4); // x07:28/-- + ulong x08 = Codec.Decode32(n, 28); // x08:32/-- + ulong x09 = (Codec.Decode24(n, 32) << 4); // x09:28/-- + ulong x10 = Codec.Decode32(n, 35); // x10:32/-- + ulong x11 = (Codec.Decode24(n, 39) << 4); // x11:28/-- + ulong x12 = Codec.Decode32(n, 42); // x12:32/-- + ulong x13 = (Codec.Decode24(n, 46) << 4); // x13:28/-- + ulong x14 = Codec.Decode32(n, 49); // x14:32/-- + ulong x15 = (Codec.Decode24(n, 53) << 4); // x15:28/-- + ulong x16 = Codec.Decode32(n, 56); // x16:32/-- + ulong x17 = (Codec.Decode24(n, 60) << 4); // x17:28/-- + ulong x18 = Codec.Decode32(n, 63); // x18:32/-- + ulong x19 = (Codec.Decode24(n, 67) << 4); // x19:28/-- + ulong x20 = Codec.Decode32(n, 70); // x20:32/-- + ulong x21 = (Codec.Decode24(n, 74) << 4); // x21:28/-- + ulong x22 = Codec.Decode32(n, 77); // x22:32/-- + ulong x23 = (Codec.Decode24(n, 81) << 4); // x23:28/-- + ulong x24 = Codec.Decode32(n, 84); // x24:32/-- + ulong x25 = 0UL; + + // TODO Fix bounds calculations which were copied from Reduce912 + + x25 += (x24 >> 28); x24 &= M28UL; // x25:28/--, x24:28/-- + x09 += x25 * L4_0; // x09:54/-- + x10 += x25 * L4_1; // x10:54/53 + x11 += x25 * L4_2; // x11:56/-- + x12 += x25 * L4_3; // x12:57/-- + x13 += x25 * L4_4; // x13:57/55 + x14 += x25 * L4_5; // x14:58/-- + x15 += x25 * L4_6; // x15:58/56 + x16 += x25 * L4_7; // x16:59/-- + + x21 += (x20 >> 28); x20 &= M28UL; // x21:58/--, x20:28/-- + x22 += (x21 >> 28); x21 &= M28UL; // x22:57/54, x21:28/-- + x23 += (x22 >> 28); x22 &= M28UL; // x23:45/42, x22:28/-- + x24 += (x23 >> 28); x23 &= M28UL; // x24:28/18, x23:28/-- + + x08 += x24 * L4_0; // x08:54/-- + x09 += x24 * L4_1; // x09:55/-- + x10 += x24 * L4_2; // x10:56/46 + x11 += x24 * L4_3; // x11:57/46 + x12 += x24 * L4_4; // x12:57/55 + x13 += x24 * L4_5; // x13:58/-- + x14 += x24 * L4_6; // x14:58/56 + x15 += x24 * L4_7; // x15:59/-- + + x07 += x23 * L4_0; // x07:54/-- + x08 += x23 * L4_1; // x08:54/53 + x09 += x23 * L4_2; // x09:56/53 + x10 += x23 * L4_3; // x10:57/46 + x11 += x23 * L4_4; // x11:57/55 + x12 += x23 * L4_5; // x12:58/-- + x13 += x23 * L4_6; // x13:58/56 + x14 += x23 * L4_7; // x14:59/-- + + x06 += x22 * L4_0; // x06:54/-- + x07 += x22 * L4_1; // x07:54/53 + x08 += x22 * L4_2; // x08:56/-- + x09 += x22 * L4_3; // x09:57/53 + x10 += x22 * L4_4; // x10:57/55 + x11 += x22 * L4_5; // x11:58/-- + x12 += x22 * L4_6; // x12:58/56 + x13 += x22 * L4_7; // x13:59/-- + + x18 += (x17 >> 28); x17 &= M28UL; // x18:59/31, x17:28/-- + x19 += (x18 >> 28); x18 &= M28UL; // x19:58/54, x18:28/-- + x20 += (x19 >> 28); x19 &= M28UL; // x20:30/29, x19:28/-- + x21 += (x20 >> 28); x20 &= M28UL; // x21:28/03, x20:28/-- + + x05 += x21 * L4_0; // x05:54/-- + x06 += x21 * L4_1; // x06:55/-- + x07 += x21 * L4_2; // x07:56/31 + x08 += x21 * L4_3; // x08:57/31 + x09 += x21 * L4_4; // x09:57/56 + x10 += x21 * L4_5; // x10:58/-- + x11 += x21 * L4_6; // x11:58/56 + x12 += x21 * L4_7; // x12:59/-- + + x04 += x20 * L4_0; // x04:54/-- + x05 += x20 * L4_1; // x05:54/53 + x06 += x20 * L4_2; // x06:56/53 + x07 += x20 * L4_3; // x07:57/31 + x08 += x20 * L4_4; // x08:57/55 + x09 += x20 * L4_5; // x09:58/-- + x10 += x20 * L4_6; // x10:58/56 + x11 += x20 * L4_7; // x11:59/-- + + x03 += x19 * L4_0; // x03:54/-- + x04 += x19 * L4_1; // x04:54/53 + x05 += x19 * L4_2; // x05:56/-- + x06 += x19 * L4_3; // x06:57/53 + x07 += x19 * L4_4; // x07:57/55 + x08 += x19 * L4_5; // x08:58/-- + x09 += x19 * L4_6; // x09:58/56 + x10 += x19 * L4_7; // x10:59/-- + + x15 += (x14 >> 28); x14 &= M28UL; // x15:59/31, x14:28/-- + x16 += (x15 >> 28); x15 &= M28UL; // x16:59/32, x15:28/-- + x17 += (x16 >> 28); x16 &= M28UL; // x17:31/29, x16:28/-- + x18 += (x17 >> 28); x17 &= M28UL; // x18:28/04, x17:28/-- + + x02 += x18 * L4_0; // x02:54/-- + x03 += x18 * L4_1; // x03:55/-- + x04 += x18 * L4_2; // x04:56/32 + x05 += x18 * L4_3; // x05:57/32 + x06 += x18 * L4_4; // x06:57/56 + x07 += x18 * L4_5; // x07:58/-- + x08 += x18 * L4_6; // x08:58/56 + x09 += x18 * L4_7; // x09:59/-- + + x01 += x17 * L4_0; // x01:54/-- + x02 += x17 * L4_1; // x02:54/53 + x03 += x17 * L4_2; // x03:56/53 + x04 += x17 * L4_3; // x04:57/32 + x05 += x17 * L4_4; // x05:57/55 + x06 += x17 * L4_5; // x06:58/-- + x07 += x17 * L4_6; // x07:58/56 + x08 += x17 * L4_7; // x08:59/-- + + x16 *= 4; + x16 += (x15 >> 26); x15 &= M26UL; + x16 += 1; // x16:30/01 + + x00 += x16 * L_0; + x01 += x16 * L_1; + x02 += x16 * L_2; + x03 += x16 * L_3; + x04 += x16 * L_4; + x05 += x16 * L_5; + x06 += x16 * L_6; + x07 += x16 * L_7; + + x01 += (x00 >> 28); x00 &= M28UL; + x02 += (x01 >> 28); x01 &= M28UL; + x03 += (x02 >> 28); x02 &= M28UL; + x04 += (x03 >> 28); x03 &= M28UL; + x05 += (x04 >> 28); x04 &= M28UL; + x06 += (x05 >> 28); x05 &= M28UL; + x07 += (x06 >> 28); x06 &= M28UL; + x08 += (x07 >> 28); x07 &= M28UL; + x09 += (x08 >> 28); x08 &= M28UL; + x10 += (x09 >> 28); x09 &= M28UL; + x11 += (x10 >> 28); x10 &= M28UL; + x12 += (x11 >> 28); x11 &= M28UL; + x13 += (x12 >> 28); x12 &= M28UL; + x14 += (x13 >> 28); x13 &= M28UL; + x15 += (x14 >> 28); x14 &= M28UL; + x16 = (x15 >> 26); x15 &= M26UL; + + x16 -= 1; + + Debug.Assert(x16 == 0UL || x16 == ulong.MaxValue); + + x00 -= x16 & L_0; + x01 -= x16 & L_1; + x02 -= x16 & L_2; + x03 -= x16 & L_3; + x04 -= x16 & L_4; + x05 -= x16 & L_5; + x06 -= x16 & L_6; + x07 -= x16 & L_7; + + x01 += (ulong)((long)x00 >> 28); x00 &= M28UL; + x02 += (ulong)((long)x01 >> 28); x01 &= M28UL; + x03 += (ulong)((long)x02 >> 28); x02 &= M28UL; + x04 += (ulong)((long)x03 >> 28); x03 &= M28UL; + x05 += (ulong)((long)x04 >> 28); x04 &= M28UL; + x06 += (ulong)((long)x05 >> 28); x05 &= M28UL; + x07 += (ulong)((long)x06 >> 28); x06 &= M28UL; + x08 += (ulong)((long)x07 >> 28); x07 &= M28UL; + x09 += (ulong)((long)x08 >> 28); x08 &= M28UL; + x10 += (ulong)((long)x09 >> 28); x09 &= M28UL; + x11 += (ulong)((long)x10 >> 28); x10 &= M28UL; + x12 += (ulong)((long)x11 >> 28); x11 &= M28UL; + x13 += (ulong)((long)x12 >> 28); x12 &= M28UL; + x14 += (ulong)((long)x13 >> 28); x13 &= M28UL; + x15 += (ulong)((long)x14 >> 28); x14 &= M28UL; + + Debug.Assert(x15 >> 26 == 0UL); + + Codec.Encode56(x00 | (x01 << 28), r, 0); + Codec.Encode56(x02 | (x03 << 28), r, 7); + Codec.Encode56(x04 | (x05 << 28), r, 14); + Codec.Encode56(x06 | (x07 << 28), r, 21); + Codec.Encode56(x08 | (x09 << 28), r, 28); + Codec.Encode56(x10 | (x11 << 28), r, 35); + Codec.Encode56(x12 | (x13 << 28), r, 42); + Codec.Encode56(x14 | (x15 << 28), r, 49); + //r[ScalarBytes - 1] = 0; +#endif + + return r; + } + +#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER + internal static void Reduce704(ReadOnlySpan<byte> n, Span<byte> r) + { + ulong x00 = Codec.Decode32(n[ 0..]); // x00:32/-- + ulong x01 = (Codec.Decode24(n[ 4..]) << 4); // x01:28/-- + ulong x02 = Codec.Decode32(n[ 7..]); // x02:32/-- + ulong x03 = (Codec.Decode24(n[ 11..]) << 4); // x03:28/-- + ulong x04 = Codec.Decode32(n[ 14..]); // x04:32/-- + ulong x05 = (Codec.Decode24(n[ 18..]) << 4); // x05:28/-- + ulong x06 = Codec.Decode32(n[ 21..]); // x06:32/-- + ulong x07 = (Codec.Decode24(n[ 25..]) << 4); // x07:28/-- + ulong x08 = Codec.Decode32(n[ 28..]); // x08:32/-- + ulong x09 = (Codec.Decode24(n[ 32..]) << 4); // x09:28/-- + ulong x10 = Codec.Decode32(n[ 35..]); // x10:32/-- + ulong x11 = (Codec.Decode24(n[ 39..]) << 4); // x11:28/-- + ulong x12 = Codec.Decode32(n[ 42..]); // x12:32/-- + ulong x13 = (Codec.Decode24(n[ 46..]) << 4); // x13:28/-- + ulong x14 = Codec.Decode32(n[ 49..]); // x14:32/-- + ulong x15 = (Codec.Decode24(n[ 53..]) << 4); // x15:28/-- + ulong x16 = Codec.Decode32(n[ 56..]); // x16:32/-- + ulong x17 = (Codec.Decode24(n[ 60..]) << 4); // x17:28/-- + ulong x18 = Codec.Decode32(n[ 63..]); // x18:32/-- + ulong x19 = (Codec.Decode24(n[ 67..]) << 4); // x19:28/-- + ulong x20 = Codec.Decode32(n[ 70..]); // x20:32/-- + ulong x21 = (Codec.Decode24(n[ 74..]) << 4); // x21:28/-- + ulong x22 = Codec.Decode32(n[ 77..]); // x22:32/-- + ulong x23 = (Codec.Decode24(n[ 81..]) << 4); // x23:28/-- + ulong x24 = Codec.Decode32(n[ 84..]); // x24:32/-- + ulong x25 = 0UL; + + // TODO Fix bounds calculations which were copied from Reduce912 + + x25 += (x24 >> 28); x24 &= M28UL; // x25:28/--, x24:28/-- + x09 += x25 * L4_0; // x09:54/-- + x10 += x25 * L4_1; // x10:54/53 + x11 += x25 * L4_2; // x11:56/-- + x12 += x25 * L4_3; // x12:57/-- + x13 += x25 * L4_4; // x13:57/55 + x14 += x25 * L4_5; // x14:58/-- + x15 += x25 * L4_6; // x15:58/56 + x16 += x25 * L4_7; // x16:59/-- + + x21 += (x20 >> 28); x20 &= M28UL; // x21:58/--, x20:28/-- + x22 += (x21 >> 28); x21 &= M28UL; // x22:57/54, x21:28/-- + x23 += (x22 >> 28); x22 &= M28UL; // x23:45/42, x22:28/-- + x24 += (x23 >> 28); x23 &= M28UL; // x24:28/18, x23:28/-- + + x08 += x24 * L4_0; // x08:54/-- + x09 += x24 * L4_1; // x09:55/-- + x10 += x24 * L4_2; // x10:56/46 + x11 += x24 * L4_3; // x11:57/46 + x12 += x24 * L4_4; // x12:57/55 + x13 += x24 * L4_5; // x13:58/-- + x14 += x24 * L4_6; // x14:58/56 + x15 += x24 * L4_7; // x15:59/-- + + x07 += x23 * L4_0; // x07:54/-- + x08 += x23 * L4_1; // x08:54/53 + x09 += x23 * L4_2; // x09:56/53 + x10 += x23 * L4_3; // x10:57/46 + x11 += x23 * L4_4; // x11:57/55 + x12 += x23 * L4_5; // x12:58/-- + x13 += x23 * L4_6; // x13:58/56 + x14 += x23 * L4_7; // x14:59/-- + + x06 += x22 * L4_0; // x06:54/-- + x07 += x22 * L4_1; // x07:54/53 + x08 += x22 * L4_2; // x08:56/-- + x09 += x22 * L4_3; // x09:57/53 + x10 += x22 * L4_4; // x10:57/55 + x11 += x22 * L4_5; // x11:58/-- + x12 += x22 * L4_6; // x12:58/56 + x13 += x22 * L4_7; // x13:59/-- + + x18 += (x17 >> 28); x17 &= M28UL; // x18:59/31, x17:28/-- + x19 += (x18 >> 28); x18 &= M28UL; // x19:58/54, x18:28/-- + x20 += (x19 >> 28); x19 &= M28UL; // x20:30/29, x19:28/-- + x21 += (x20 >> 28); x20 &= M28UL; // x21:28/03, x20:28/-- + + x05 += x21 * L4_0; // x05:54/-- + x06 += x21 * L4_1; // x06:55/-- + x07 += x21 * L4_2; // x07:56/31 + x08 += x21 * L4_3; // x08:57/31 + x09 += x21 * L4_4; // x09:57/56 + x10 += x21 * L4_5; // x10:58/-- + x11 += x21 * L4_6; // x11:58/56 + x12 += x21 * L4_7; // x12:59/-- + + x04 += x20 * L4_0; // x04:54/-- + x05 += x20 * L4_1; // x05:54/53 + x06 += x20 * L4_2; // x06:56/53 + x07 += x20 * L4_3; // x07:57/31 + x08 += x20 * L4_4; // x08:57/55 + x09 += x20 * L4_5; // x09:58/-- + x10 += x20 * L4_6; // x10:58/56 + x11 += x20 * L4_7; // x11:59/-- + + x03 += x19 * L4_0; // x03:54/-- + x04 += x19 * L4_1; // x04:54/53 + x05 += x19 * L4_2; // x05:56/-- + x06 += x19 * L4_3; // x06:57/53 + x07 += x19 * L4_4; // x07:57/55 + x08 += x19 * L4_5; // x08:58/-- + x09 += x19 * L4_6; // x09:58/56 + x10 += x19 * L4_7; // x10:59/-- + + x15 += (x14 >> 28); x14 &= M28UL; // x15:59/31, x14:28/-- + x16 += (x15 >> 28); x15 &= M28UL; // x16:59/32, x15:28/-- + x17 += (x16 >> 28); x16 &= M28UL; // x17:31/29, x16:28/-- + x18 += (x17 >> 28); x17 &= M28UL; // x18:28/04, x17:28/-- + + x02 += x18 * L4_0; // x02:54/-- + x03 += x18 * L4_1; // x03:55/-- + x04 += x18 * L4_2; // x04:56/32 + x05 += x18 * L4_3; // x05:57/32 + x06 += x18 * L4_4; // x06:57/56 + x07 += x18 * L4_5; // x07:58/-- + x08 += x18 * L4_6; // x08:58/56 + x09 += x18 * L4_7; // x09:59/-- + + x01 += x17 * L4_0; // x01:54/-- + x02 += x17 * L4_1; // x02:54/53 + x03 += x17 * L4_2; // x03:56/53 + x04 += x17 * L4_3; // x04:57/32 + x05 += x17 * L4_4; // x05:57/55 + x06 += x17 * L4_5; // x06:58/-- + x07 += x17 * L4_6; // x07:58/56 + x08 += x17 * L4_7; // x08:59/-- + + x16 *= 4; + x16 += (x15 >> 26); x15 &= M26UL; + x16 += 1; // x16:30/01 + + x00 += x16 * L_0; + x01 += x16 * L_1; + x02 += x16 * L_2; + x03 += x16 * L_3; + x04 += x16 * L_4; + x05 += x16 * L_5; + x06 += x16 * L_6; + x07 += x16 * L_7; + + x01 += (x00 >> 28); x00 &= M28UL; + x02 += (x01 >> 28); x01 &= M28UL; + x03 += (x02 >> 28); x02 &= M28UL; + x04 += (x03 >> 28); x03 &= M28UL; + x05 += (x04 >> 28); x04 &= M28UL; + x06 += (x05 >> 28); x05 &= M28UL; + x07 += (x06 >> 28); x06 &= M28UL; + x08 += (x07 >> 28); x07 &= M28UL; + x09 += (x08 >> 28); x08 &= M28UL; + x10 += (x09 >> 28); x09 &= M28UL; + x11 += (x10 >> 28); x10 &= M28UL; + x12 += (x11 >> 28); x11 &= M28UL; + x13 += (x12 >> 28); x12 &= M28UL; + x14 += (x13 >> 28); x13 &= M28UL; + x15 += (x14 >> 28); x14 &= M28UL; + x16 = (x15 >> 26); x15 &= M26UL; + + x16 -= 1; + + Debug.Assert(x16 == 0UL || x16 == ulong.MaxValue); + + x00 -= x16 & L_0; + x01 -= x16 & L_1; + x02 -= x16 & L_2; + x03 -= x16 & L_3; + x04 -= x16 & L_4; + x05 -= x16 & L_5; + x06 -= x16 & L_6; + x07 -= x16 & L_7; + + x01 += (ulong)((long)x00 >> 28); x00 &= M28UL; + x02 += (ulong)((long)x01 >> 28); x01 &= M28UL; + x03 += (ulong)((long)x02 >> 28); x02 &= M28UL; + x04 += (ulong)((long)x03 >> 28); x03 &= M28UL; + x05 += (ulong)((long)x04 >> 28); x04 &= M28UL; + x06 += (ulong)((long)x05 >> 28); x05 &= M28UL; + x07 += (ulong)((long)x06 >> 28); x06 &= M28UL; + x08 += (ulong)((long)x07 >> 28); x07 &= M28UL; + x09 += (ulong)((long)x08 >> 28); x08 &= M28UL; + x10 += (ulong)((long)x09 >> 28); x09 &= M28UL; + x11 += (ulong)((long)x10 >> 28); x10 &= M28UL; + x12 += (ulong)((long)x11 >> 28); x11 &= M28UL; + x13 += (ulong)((long)x12 >> 28); x12 &= M28UL; + x14 += (ulong)((long)x13 >> 28); x13 &= M28UL; + x15 += (ulong)((long)x14 >> 28); x14 &= M28UL; + + Debug.Assert(x15 >> 26 == 0UL); + + Codec.Encode56(x00 | (x01 << 28), r); + Codec.Encode56(x02 | (x03 << 28), r[7..]); + Codec.Encode56(x04 | (x05 << 28), r[14..]); + Codec.Encode56(x06 | (x07 << 28), r[21..]); + Codec.Encode56(x08 | (x09 << 28), r[28..]); + Codec.Encode56(x10 | (x11 << 28), r[35..]); + Codec.Encode56(x12 | (x13 << 28), r[42..]); + Codec.Encode56(x14 | (x15 << 28), r[49..]); + r[ScalarBytes - 1] = 0; + } +#endif + + internal static byte[] Reduce912(byte[] n) { byte[] r = new byte[ScalarBytes]; #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER - Reduce(n, r); + Reduce912(n, r); #else ulong x00 = Codec.Decode32(n, 0); // x00:32/-- ulong x01 = (Codec.Decode24(n, 4) << 4); // x01:28/-- @@ -416,7 +837,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032 } #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER - internal static void Reduce(ReadOnlySpan<byte> n, Span<byte> r) + internal static void Reduce912(ReadOnlySpan<byte> n, Span<byte> r) { ulong x00 = Codec.Decode32(n[ 0..]); // x00:32/-- ulong x01 = (Codec.Decode24(n[ 4..]) << 4); // x01:28/-- |