summary refs log tree commit diff
path: root/crypto/src/math
diff options
context:
space:
mode:
author Peter Dettman <peter.dettman@bouncycastle.org> 2022-11-25 19:12:01 +0700
committer Peter Dettman <peter.dettman@bouncycastle.org> 2022-11-25 19:12:01 +0700
commit df0e0d95e952954d9c9b8588372b1f194bf329ce (patch)
tree ac34a878cc1c31e9744b0d2aff468ab4d4442d56 /crypto/src/math
parent Ed25519: cofactored verification (diff)
download BouncyCastle.NET-ed25519-df0e0d95e952954d9c9b8588372b1f194bf329ce.tar.xz
Ed448: cofactored verification
- Perf. opts.: Pornin's basis reduction
- factor out Scalar448 class
- factor out ScalarUtilities class
Diffstat (limited to 'crypto/src/math')
-rw-r--r-- crypto/src/math/ec/rfc7748/X448Field.cs 24
-rw-r--r-- crypto/src/math/ec/rfc8032/Ed25519.cs 34
-rw-r--r-- crypto/src/math/ec/rfc8032/Ed448.cs 1021
-rw-r--r-- crypto/src/math/ec/rfc8032/Scalar25519.cs 338
-rw-r--r-- crypto/src/math/ec/rfc8032/Scalar448.cs 819
-rw-r--r-- crypto/src/math/ec/rfc8032/ScalarUtilities.cs 294
6 files changed, 1387 insertions, 1143 deletions
diff --git a/crypto/src/math/ec/rfc7748/X448Field.cs b/crypto/src/math/ec/rfc7748/X448Field.cs
index 1df837d3a..7169bd6d8 100644
--- a/crypto/src/math/ec/rfc7748/X448Field.cs
+++ b/crypto/src/math/ec/rfc7748/X448Field.cs
@@ -241,6 +241,18 @@ namespace Org.BouncyCastle.Math.EC.Rfc7748
         }
 #endif
 
+        public static void Decode(byte[] x, uint[] z)
+        {
+            Decode56(x, 0, z, 0);
+            Decode56(x, 7, z, 2);
+            Decode56(x, 14, z, 4);
+            Decode56(x, 21, z, 6);
+            Decode56(x, 28, z, 8);
+            Decode56(x, 35, z, 10);
+            Decode56(x, 42, z, 12);
+            Decode56(x, 49, z, 14);
+        }
+
         public static void Decode(byte[] x, int xOff, uint[] z)
         {
             Decode56(x, xOff, z, 0);
@@ -369,6 +381,18 @@ namespace Org.BouncyCastle.Math.EC.Rfc7748
         }
 #endif
 
+        public static void Encode(uint[] x, byte[] z)
+        {
+            Encode56(x, 0, z, 0);
+            Encode56(x, 2, z, 7);
+            Encode56(x, 4, z, 14);
+            Encode56(x, 6, z, 21);
+            Encode56(x, 8, z, 28);
+            Encode56(x, 10, z, 35);
+            Encode56(x, 12, z, 42);
+            Encode56(x, 14, z, 49);
+        }
+
         public static void Encode(uint[] x, byte[] z, int zOff)
         {
             Encode56(x, 0, z, zOff);
diff --git a/crypto/src/math/ec/rfc8032/Ed25519.cs b/crypto/src/math/ec/rfc8032/Ed25519.cs
index 0c95fade3..9f9daf39c 100644
--- a/crypto/src/math/ec/rfc8032/Ed25519.cs
+++ b/crypto/src/math/ec/rfc8032/Ed25519.cs
@@ -77,14 +77,15 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
         private static readonly int[] C_d4 = { 0x0165E2B2, 0x034DCA13, 0x002ADD7A, 0x01A8283B, 0x00038052, 0x01E7A260,
             0x03407977, 0x019CE331, 0x01C56DFF, 0x00901B67 };
 
-        private const int WnafWidth = 5;
-        private const int WnafWidthBase = 7;
+        //private const int WnafWidth = 5;
+        private const int WnafWidth128 = 4;
+        private const int WnafWidthBase = 6;
 
         // ScalarMultBase is hard-coded for these values of blocks, teeth, spacing so they can't be freely changed
         private const int PrecompBlocks = 8;
         private const int PrecompTeeth = 4;
         private const int PrecompSpacing = 8;
-        //private const int PrecompRange = PrecompBlocks * PrecompTeeth * PrecompSpacing; // range == 256
+        private const int PrecompRange = PrecompBlocks * PrecompTeeth * PrecompSpacing; // range == 256
         private const int PrecompPoints = 1 << (PrecompTeeth - 1);
         private const int PrecompMask = PrecompPoints - 1;
 
@@ -614,8 +615,6 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 
             Span<uint> v0 = stackalloc uint[4];
             Span<uint> v1 = stackalloc uint[4];
-            Scalar25519.ReduceBasisVar(nA, v0, v1);
-            Scalar25519.Multiply128Var(nS, v1, nS);
 #else
             byte[] R = Copy(sig, sigOff, PointBytes);
             byte[] S = Copy(sig, sigOff + PointBytes, ScalarBytes);
@@ -658,9 +657,10 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 
             uint[] v0 = new uint[4];
             uint[] v1 = new uint[4];
+#endif
+
             Scalar25519.ReduceBasisVar(nA, v0, v1);
             Scalar25519.Multiply128Var(nS, v1, nS);
-#endif
 
             Init(out PointAccum pZ);
             ScalarMultStraus128Var(nS, v0, ref pA, v1, ref pR, ref pZ);
@@ -1302,7 +1302,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 #endif
 
             Scalar25519.Decode(k, n);
-            Scalar25519.ToSignedDigits(n, n);
+            Scalar25519.ToSignedDigits(256, n, n);
 
             Init(out PointPrecompZ q);
             Init(out PointTemp t);
@@ -1327,7 +1327,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
         }
 
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-            private static void ScalarMultBase(ReadOnlySpan<byte> k, ref PointAccum r)
+        private static void ScalarMultBase(ReadOnlySpan<byte> k, ref PointAccum r)
 #else
         private static void ScalarMultBase(byte[] k, ref PointAccum r)
 #endif
@@ -1347,7 +1347,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 #endif
 
             Scalar25519.Decode(k, n);
-            Scalar25519.ToSignedDigits(n, n);
+            Scalar25519.ToSignedDigits(PrecompRange, n, n);
             GroupCombBits(n);
 
             Init(out PointPrecomp p);
@@ -1452,9 +1452,11 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 #else
             sbyte[] ws_p = new sbyte[253];
 #endif
-            Scalar25519.GetOrderWnafVar(WnafWidth, ws_p);
 
-            int count = 1 << (WnafWidth - 2);
+            // NOTE: WnafWidth128 because of the special structure of the order 
+            Scalar25519.GetOrderWnafVar(WnafWidth128, ws_p);
+
+            int count = 1 << (WnafWidth128 - 2);
             PointPrecompZ[] tp = new PointPrecompZ[count];
             Init(out PointTemp t);
             PointPrecomputeZ(ref p, tp, count, ref t);
@@ -1486,6 +1488,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 #endif
         {
             Debug.Assert(nb.Length == ScalarUints);
+            Debug.Assert(nb[ScalarUints - 1] >> 29 == 0U);
             Debug.Assert(np.Length == 4);
             Debug.Assert(nq.Length == 4);
 
@@ -1502,10 +1505,10 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 #endif
 
             Wnaf.GetSignedVar(nb, WnafWidthBase, ws_b);
-            Wnaf.GetSignedVar(np, WnafWidth - 1, ws_p);
-            Wnaf.GetSignedVar(nq, WnafWidth - 1, ws_q);
+            Wnaf.GetSignedVar(np, WnafWidth128, ws_p);
+            Wnaf.GetSignedVar(nq, WnafWidth128, ws_q);
 
-            int count = 1 << (WnafWidth - 3);
+            int count = 1 << (WnafWidth128 - 2);
             PointPrecompZ[] tp = new PointPrecompZ[count];
             PointPrecompZ[] tq = new PointPrecompZ[count];
             Init(out PointTemp t);
@@ -1514,7 +1517,8 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 
             PointSetNeutral(ref r);
 
-            for (int bit = 127; bit >= 0; --bit)
+            int bit = 128;
+            while (--bit >= 0)
             {
                 int wb = ws_b[bit];
                 if (wb != 0)
diff --git a/crypto/src/math/ec/rfc8032/Ed448.cs b/crypto/src/math/ec/rfc8032/Ed448.cs
index cc189615b..7b774896b 100644
--- a/crypto/src/math/ec/rfc8032/Ed448.cs
+++ b/crypto/src/math/ec/rfc8032/Ed448.cs
@@ -31,9 +31,6 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             Ed448ph = 1,
         }
 
-        private const ulong M26UL = 0x03FFFFFFUL;
-        private const ulong M28UL = 0x0FFFFFFFUL;
-
         private const int CoordUints = 14;
         private const int PointBytes = CoordUints * 4 + 1;
         private const int ScalarUints = 14;
@@ -47,28 +44,9 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
         // "SigEd448"
         private static readonly byte[] Dom4Prefix = new byte[]{ 0x53, 0x69, 0x67, 0x45, 0x64, 0x34, 0x34, 0x38 };
 
-        private static readonly uint[] P = { 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU,
-            0xFFFFFFFEU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU };
-        private static readonly uint[] L = { 0xAB5844F3U, 0x2378C292U, 0x8DC58F55U, 0x216CC272U, 0xAED63690U, 0xC44EDB49U, 0x7CCA23E9U,
-            0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0x3FFFFFFFU };
-
-        private const int L_0 = 0x04A7BB0D;     // L_0:26/24
-        private const int L_1 = 0x0873D6D5;     // L_1:27/23
-        private const int L_2 = 0x0A70AADC;     // L_2:27/26
-        private const int L_3 = 0x03D8D723;     // L_3:26/--
-        private const int L_4 = 0x096FDE93;     // L_4:27/25
-        private const int L_5 = 0x0B65129C;     // L_5:27/26
-        private const int L_6 = 0x063BB124;     // L_6:27/--
-        private const int L_7 = 0x08335DC1;     // L_7:27/22
-
-        private const int L4_0 = 0x029EEC34;    // L4_0:25/24
-        private const int L4_1 = 0x01CF5B55;    // L4_1:25/--
-        private const int L4_2 = 0x09C2AB72;    // L4_2:27/25
-        private const int L4_3 = 0x0F635C8E;    // L4_3:28/--
-        private const int L4_4 = 0x05BF7A4C;    // L4_4:26/25
-        private const int L4_5 = 0x0D944A72;    // L4_5:28/--
-        private const int L4_6 = 0x08EEC492;    // L4_6:27/24
-        private const int L4_7 = 0x20CD7705;    // L4_7:29/24
+        private static readonly uint[] P = { 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU,
+            0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFEU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU,
+            0xFFFFFFFFU };
 
         private static readonly uint[] B_x = { 0x070CC05EU, 0x026A82BCU, 0x00938E26U, 0x080E18B0U, 0x0511433BU,
             0x0F72AB66U, 0x0412AE1AU, 0x0A3D3A46U, 0x0A6DE324U, 0x00F1767EU, 0x04657047U, 0x036DA9E1U, 0x05A622BFU,
@@ -77,17 +55,18 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             0x01CE67C3U, 0x073AD3FFU, 0x005A0C2DU, 0x07789C1EU, 0x0A398408U, 0x0A73736CU, 0x0C7624BEU, 0x003756C9U,
             0x02488762U, 0x016EB6BCU, 0x0693F467U };
 
-        // 2^224 * B
-        private static readonly uint[] B224_x = { 0x091780C7U, 0x0A7EA989U, 0x0D2476B6U, 0x004E4ECCU, 0x0C494B68U,
-            0x00AF9F58U, 0x0DEE64FDU, 0x0E0F269FU, 0x0021BD26U, 0x085A61F6U, 0x0B5D284BU, 0x0C265C35U, 0x03775AFDU,
-            0x058755EAU, 0x02ECF2C6U, 0x0617F174U };
-        private static readonly uint[] B224_y = { 0x05EC556AU, 0x050109E2U, 0x0FD57E39U, 0x0235366BU, 0x044B6B2EU,
-            0x07B3C976U, 0x0B2B7B9CU, 0x0F7F9E82U, 0x00EC6409U, 0x0B6196ABU, 0x00A20D9EU, 0x088F1D16U, 0x0586F761U,
-            0x0e3BE3B4U, 0x0E26395DU, 0x09983C26U };
+        // 2^225 * B
+        private static readonly uint[] B225_x = { 0x06909ee2U, 0x01d7605cU, 0x0995ec8aU, 0x0fc4d970U, 0x0cf2b361U,
+            0x02d82e9dU, 0x01225f55U, 0x007f0ef6U, 0x0aee9c55U, 0x0a240c13U, 0x05627b54U, 0x0d449d1eU, 0x03a44575U,
+            0x007164a7U, 0x0bd4bd71U, 0x061a15fdU };
+        private static readonly uint[] B225_y = { 0x0d3a9fe4U, 0x030696b9U, 0x07e7e326U, 0x068308c7U, 0x0ce0b8c8U,
+            0x03ac222bU, 0x0304db8eU, 0x083ee319U, 0x05e5db0bU, 0x0eca503bU, 0x0b1c6539U, 0x078a8dceU, 0x02d256bcU,
+            0x04a8b05eU, 0x0bd9fd57U, 0x0a1c3cb8U };
 
         private const int C_d = -39081;
 
-        private const int WnafWidth = 5;
+        //private const int WnafWidth = 6;
+        private const int WnafWidth225 = 5;
         private const int WnafWidthBase = 7;
 
         // ScalarMultBase supports varying blocks, teeth, spacing so long as their product is in range [449, 479]
@@ -100,7 +79,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 
         private static readonly object PrecompLock = new object();
         private static PointAffine[] PrecompBaseWnaf = null;
-        private static PointAffine[] PrecompBase224Wnaf = null;
+        private static PointAffine[] PrecompBase225Wnaf = null;
         private static uint[] PrecompBaseComb = null;
 
         private struct PointAffine
@@ -115,9 +94,9 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 
         private static byte[] CalculateS(byte[] r, byte[] k, byte[] s)
         {
-            uint[] t = new uint[ScalarUints * 2];   DecodeScalar(r, 0, t);
-            uint[] u = new uint[ScalarUints];       DecodeScalar(k, 0, u);
-            uint[] v = new uint[ScalarUints];       DecodeScalar(s, 0, v);
+            uint[] t = new uint[ScalarUints * 2];   Scalar448.Decode(r, t);
+            uint[] u = new uint[ScalarUints];       Scalar448.Decode(k, u);
+            uint[] v = new uint[ScalarUints];       Scalar448.Decode(s, v);
 
             Nat.MulAddTo(ScalarUints, u, v, t);
 
@@ -126,7 +105,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             {
                 Codec.Encode32(t[i], result, i * 4);
             }
-            return ReduceScalar(result);
+            return Scalar448.Reduce(result);
         }
 
         private static bool CheckContextVar(byte[] ctx)
@@ -208,20 +187,20 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
         }
 #endif
 
-
-        private static bool CheckPointFullVar(byte[] p)
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        private static bool CheckPointFullVar(ReadOnlySpan<byte> p)
         {
             if ((p[PointBytes - 1] & 0x7F) != 0x00)
                 return false;
 
-            uint y13 = Codec.Decode32(p, 52);
+            uint y13 = Codec.Decode32(p[52..]);
 
             uint t0 = y13;
             uint t1 = y13 ^ P[13];
 
             for (int i = CoordUints - 2; i > 0; --i)
             {
-                uint yi = Codec.Decode32(p, i * 4);
+                uint yi = Codec.Decode32(p[(i * 4)..]);
 
                 // Reject non-canonical encodings (i.e. >= P)
                 if (t1 == 0 && yi > P[i])
@@ -231,7 +210,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
                 t1 |= yi ^ P[i];
             }
 
-            uint y0 = Codec.Decode32(p, 0);
+            uint y0 = Codec.Decode32(p);
 
             // Reject 0 and 1
             if (t0 == 0 && y0 <= 1U)
@@ -243,24 +222,40 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 
             return true;
         }
-
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        private static bool CheckScalarVar(ReadOnlySpan<byte> s, Span<uint> n)
+#else
+        private static bool CheckPointFullVar(byte[] p)
         {
-            if (s[ScalarBytes - 1] != 0x00)
+            if ((p[PointBytes - 1] & 0x7F) != 0x00)
                 return false;
 
-            DecodeScalar(s, n);
-            return !Nat.Gte(ScalarUints, n, L);
-        }
-#else
-        private static bool CheckScalarVar(byte[] s, uint[] n)
-        {
-            if (s[ScalarBytes - 1] != 0x00)
+            uint y13 = Codec.Decode32(p, 52);
+
+            uint t0 = y13;
+            uint t1 = y13 ^ P[13];
+
+            for (int i = CoordUints - 2; i > 0; --i)
+            {
+                uint yi = Codec.Decode32(p, i * 4);
+
+                // Reject non-canonical encodings (i.e. >= P)
+                if (t1 == 0 && yi > P[i])
+                    return false;
+
+                t0 |= yi;
+                t1 |= yi ^ P[i];
+            }
+
+            uint y0 = Codec.Decode32(p, 0);
+
+            // Reject 0 and 1
+            if (t0 == 0 && y0 <= 1U)
                 return false;
 
-            DecodeScalar(s, 0, n);
-            return !Nat.Gte(ScalarUints, n, L);
+            // Reject P - 1 and non-canonical encodings (i.e. >= P)
+            if (t1 == 0 && y0 >= (P[0] - 1U))
+                return false;
+
+            return true;
         }
 #endif
 
@@ -281,16 +276,15 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             return new ShakeDigest(256);
         }
 
-        private static bool DecodePointVar(byte[] p, int pOff, bool negate, ref PointProjective r)
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        private static bool DecodePointVar(ReadOnlySpan<byte> p, bool negate, ref PointProjective r)
+#else
+        private static bool DecodePointVar(byte[] p, bool negate, ref PointProjective r)
+#endif
         {
-            byte[] py = Copy(p, pOff, PointBytes);
-            if (!CheckPointFullVar(py))
-                return false;
-
-            int x_0 = (py[PointBytes - 1] & 0x80) >> 7;
-            py[PointBytes - 1] &= 0x7F;
+            int x_0 = (p[PointBytes - 1] & 0x80) >> 7;
 
-            F.Decode(py, 0, r.y);
+            F.Decode(p, r.y);
 
             uint[] u = F.Create();
             uint[] v = F.Create();
@@ -317,22 +311,6 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             return true;
         }
 
-        private static void DecodeScalar(byte[] k, int kOff, uint[] n)
-        {
-            Debug.Assert(k[kOff + ScalarBytes - 1] == 0x00);
-
-            Codec.Decode32(k, kOff, n, 0, ScalarUints);
-        }
-
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        private static void DecodeScalar(ReadOnlySpan<byte> k, Span<uint> n)
-        {
-            Debug.Assert(k[ScalarBytes - 1] == 0x00);
-
-            Codec.Decode32(k, n[..ScalarUints]);
-        }
-#endif
-
         private static void Dom4(IXof d, byte phflag, byte[] ctx)
         {
             int n = Dom4Prefix.Length;
@@ -466,7 +444,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             d.BlockUpdate(m, mOff, mLen);
             d.OutputFinal(h, 0, h.Length);
 
-            byte[] r = ReduceScalar(h);
+            byte[] r = Scalar448.Reduce(h);
             byte[] R = new byte[PointBytes];
             ScalarMultBaseEncoded(r, R, 0);
 
@@ -476,7 +454,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             d.BlockUpdate(m, mOff, mLen);
             d.OutputFinal(h, 0, h.Length);
 
-            byte[] k = ReduceScalar(h);
+            byte[] k = Scalar448.Reduce(h);
             byte[] S = CalculateS(r, k, s);
 
             Array.Copy(R, 0, sig, sigOff, PointBytes);
@@ -529,21 +507,30 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
                 throw new ArgumentException("ctx");
 
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-            Span<byte> RS = stackalloc byte[PointBytes + ScalarBytes];
-            RS.CopyFrom(sig.AsSpan(sigOff, PointBytes + ScalarBytes));
+            Span<byte> signature = stackalloc byte[SignatureSize];
+            signature.CopyFrom(sig.AsSpan(sigOff, SignatureSize));
+            var R = signature[..PointBytes];
+            var S = signature[PointBytes..];
 
-            var R = RS[..PointBytes];
-            var S = RS[PointBytes..];
+            Span<byte> A = stackalloc byte[PublicKeySize];
+            A.CopyFrom(pk.AsSpan(pkOff));
 
             if (!CheckPointVar(R))
                 return false;
 
             Span<uint> nS = stackalloc uint[ScalarUints];
-            if (!CheckScalarVar(S, nS))
+            if (!Scalar448.CheckVar(S, nS))
+                return false;
+
+            if (!CheckPointFullVar(A))
+                return false;
+
+            Init(out PointProjective pR);
+            if (!DecodePointVar(R, true, ref pR))
                 return false;
 
             Init(out PointProjective pA);
-            if (!DecodePointVar(pk, pkOff, true, ref pA))
+            if (!DecodePointVar(A, true, ref pA))
                 return false;
 
             IXof d = CreateXof();
@@ -551,34 +538,39 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 
             Dom4(d, phflag, ctx);
             d.BlockUpdate(R);
-            d.BlockUpdate(pk.AsSpan(pkOff, PointBytes));
+            d.BlockUpdate(A);
             d.BlockUpdate(m.AsSpan(mOff, mLen));
             d.OutputFinal(h);
 
             Span<byte> k = stackalloc byte[ScalarBytes];
-            ReduceScalar(h, k);
+            Scalar448.Reduce(h, k);
 
             Span<uint> nA = stackalloc uint[ScalarUints];
-            DecodeScalar(k, nA);
-
-            Init(out PointProjective pR);
-            ScalarMultStrausVar(nS, nA, ref pA, ref pR);
+            Scalar448.Decode(k, nA);
 
-            Span<byte> check = stackalloc byte[PointBytes];
-            return 0 != EncodePoint(ref pR, check) && check.SequenceEqual(R);
+            Span<uint> v0 = stackalloc uint[8];
+            Span<uint> v1 = stackalloc uint[8];
 #else
             byte[] R = Copy(sig, sigOff, PointBytes);
             byte[] S = Copy(sig, sigOff + PointBytes, ScalarBytes);
+            byte[] A = Copy(pk, pkOff, PublicKeySize);
 
             if (!CheckPointVar(R))
                 return false;
 
             uint[] nS = new uint[ScalarUints];
-            if (!CheckScalarVar(S, nS))
+            if (!Scalar448.CheckVar(S, nS))
+                return false;
+
+            if (!CheckPointFullVar(A))
+                return false;
+
+            Init(out PointProjective pR);
+            if (!DecodePointVar(R, true, ref pR))
                 return false;
 
             Init(out PointProjective pA);
-            if (!DecodePointVar(pk, pkOff, true, ref pA))
+            if (!DecodePointVar(A, true, ref pA))
                 return false;
 
             IXof d = CreateXof();
@@ -586,21 +578,30 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 
             Dom4(d, phflag, ctx);
             d.BlockUpdate(R, 0, PointBytes);
-            d.BlockUpdate(pk, pkOff, PointBytes);
+            d.BlockUpdate(A, 0, PointBytes);
             d.BlockUpdate(m, mOff, mLen);
             d.OutputFinal(h, 0, h.Length);
 
-            byte[] k = ReduceScalar(h);
+            byte[] k = Scalar448.Reduce(h);
 
             uint[] nA = new uint[ScalarUints];
-            DecodeScalar(k, 0, nA);
+            Scalar448.Decode(k, nA);
 
-            Init(out PointProjective pR);
-            ScalarMultStrausVar(nS, nA, ref pA, ref pR);
-
-            byte[] check = new byte[PointBytes];
-            return 0 != EncodePoint(ref pR, check, 0) && Arrays.AreEqual(check, R);
+            uint[] v0 = new uint[8];
+            uint[] v1 = new uint[8];
 #endif
+
+            Scalar448.ReduceBasisVar(nA, v0, v1);
+            Scalar448.Multiply225Var(nS, v1, nS);
+
+            Init(out PointProjective pZ);
+            ScalarMultStraus225Var(nS, v0, ref pA, v1, ref pR, ref pZ);
+
+            F.Normalize(pZ.x);
+            F.Normalize(pZ.y);
+            F.Normalize(pZ.z);
+
+            return IsNeutralElementVar(pZ.x, pZ.y, pZ.z);
         }
 
         private static void Init(out PointAffine r)
@@ -996,12 +997,12 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 
                 PointPrecomputeVar(ref p, points, 0, wnafPoints);
 
-                Init(out PointProjective p224);
-                F.Copy(B224_x, 0, p224.x, 0);
-                F.Copy(B224_y, 0, p224.y, 0);
-                F.One(p224.z);
+                Init(out PointProjective p225);
+                F.Copy(B225_x, 0, p225.x, 0);
+                F.Copy(B225_y, 0, p225.y, 0);
+                F.One(p225.z);
 
-                PointPrecomputeVar(ref p224, points, wnafPoints, wnafPoints);
+                PointPrecomputeVar(ref p225, points, wnafPoints, wnafPoints);
 
                 int pointsIndex = wnafPoints * 2;
                 PointProjective[] toothPowers = new PointProjective[PrecompTeeth];
@@ -1065,11 +1066,11 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
                     F.Mul(q.y, q.z, r.y);       F.Normalize(r.y);
                 }
 
-                PrecompBase224Wnaf = new PointAffine[wnafPoints];
+                PrecompBase225Wnaf = new PointAffine[wnafPoints];
                 for (int i = 0; i < wnafPoints; ++i)
                 {
                     ref PointProjective q = ref points[wnafPoints + i];
-                    ref PointAffine r = ref PrecompBase224Wnaf[i];
+                    ref PointAffine r = ref PrecompBase225Wnaf[i];
                     Init(out r);
 
                     F.Mul(q.x, q.z, r.x);       F.Normalize(r.x);
@@ -1112,619 +1113,23 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
         }
 #endif
 
-        private static byte[] ReduceScalar(byte[] n)
-        {
-            byte[] r = new byte[ScalarBytes];
-
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-            ReduceScalar(n, r);
+        private static void ScalarMult(ReadOnlySpan<byte> k, ref PointProjective p, ref PointProjective r)
 #else
-            ulong x00 =  Codec.Decode32(n,   0);                // x00:32/--
-            ulong x01 = (Codec.Decode24(n,   4) << 4);          // x01:28/--
-            ulong x02 =  Codec.Decode32(n,   7);                // x02:32/--
-            ulong x03 = (Codec.Decode24(n,  11) << 4);          // x03:28/--
-            ulong x04 =  Codec.Decode32(n,  14);                // x04:32/--
-            ulong x05 = (Codec.Decode24(n,  18) << 4);          // x05:28/--
-            ulong x06 =  Codec.Decode32(n,  21);                // x06:32/--
-            ulong x07 = (Codec.Decode24(n,  25) << 4);          // x07:28/--
-            ulong x08 =  Codec.Decode32(n,  28);                // x08:32/--
-            ulong x09 = (Codec.Decode24(n,  32) << 4);          // x09:28/--
-            ulong x10 =  Codec.Decode32(n,  35);                // x10:32/--
-            ulong x11 = (Codec.Decode24(n,  39) << 4);          // x11:28/--
-            ulong x12 =  Codec.Decode32(n,  42);                // x12:32/--
-            ulong x13 = (Codec.Decode24(n,  46) << 4);          // x13:28/--
-            ulong x14 =  Codec.Decode32(n,  49);                // x14:32/--
-            ulong x15 = (Codec.Decode24(n,  53) << 4);          // x15:28/--
-            ulong x16 =  Codec.Decode32(n,  56);                // x16:32/--
-            ulong x17 = (Codec.Decode24(n,  60) << 4);          // x17:28/--
-            ulong x18 =  Codec.Decode32(n,  63);                // x18:32/--
-            ulong x19 = (Codec.Decode24(n,  67) << 4);          // x19:28/--
-            ulong x20 =  Codec.Decode32(n,  70);                // x20:32/--
-            ulong x21 = (Codec.Decode24(n,  74) << 4);          // x21:28/--
-            ulong x22 =  Codec.Decode32(n,  77);                // x22:32/--
-            ulong x23 = (Codec.Decode24(n,  81) << 4);          // x23:28/--
-            ulong x24 =  Codec.Decode32(n,  84);                // x24:32/--
-            ulong x25 = (Codec.Decode24(n,  88) << 4);          // x25:28/--
-            ulong x26 =  Codec.Decode32(n,  91);                // x26:32/--
-            ulong x27 = (Codec.Decode24(n,  95) << 4);          // x27:28/--
-            ulong x28 =  Codec.Decode32(n,  98);                // x28:32/--
-            ulong x29 = (Codec.Decode24(n, 102) << 4);          // x29:28/--
-            ulong x30 =  Codec.Decode32(n, 105);                // x30:32/--
-            ulong x31 = (Codec.Decode24(n, 109) << 4);          // x31:28/--
-            ulong x32 =  Codec.Decode16(n, 112);                // x32:16/--
-
-            //x32 += (x31 >> 28); x31 &= M28UL;
-            x16 += x32 * L4_0;                          // x16:42/--
-            x17 += x32 * L4_1;                          // x17:41/28
-            x18 += x32 * L4_2;                          // x18:43/42
-            x19 += x32 * L4_3;                          // x19:44/28
-            x20 += x32 * L4_4;                          // x20:43/--
-            x21 += x32 * L4_5;                          // x21:44/28
-            x22 += x32 * L4_6;                          // x22:43/41
-            x23 += x32 * L4_7;                          // x23:45/41
-
-            x31 += (x30 >> 28); x30 &= M28UL;           // x31:28/--, x30:28/--
-            x15 += x31 * L4_0;                          // x15:54/--
-            x16 += x31 * L4_1;                          // x16:53/42
-            x17 += x31 * L4_2;                          // x17:55/54
-            x18 += x31 * L4_3;                          // x18:56/44
-            x19 += x31 * L4_4;                          // x19:55/--
-            x20 += x31 * L4_5;                          // x20:56/43
-            x21 += x31 * L4_6;                          // x21:55/53
-            x22 += x31 * L4_7;                          // x22:57/53
-
-            //x30 += (x29 >> 28); x29 &= M28UL;
-            x14 += x30 * L4_0;                          // x14:54/--
-            x15 += x30 * L4_1;                          // x15:54/53
-            x16 += x30 * L4_2;                          // x16:56/--
-            x17 += x30 * L4_3;                          // x17:57/--
-            x18 += x30 * L4_4;                          // x18:56/55
-            x19 += x30 * L4_5;                          // x19:56/55
-            x20 += x30 * L4_6;                          // x20:57/--
-            x21 += x30 * L4_7;                          // x21:57/56
-
-            x29 += (x28 >> 28); x28 &= M28UL;           // x29:28/--, x28:28/--
-            x13 += x29 * L4_0;                          // x13:54/--
-            x14 += x29 * L4_1;                          // x14:54/53
-            x15 += x29 * L4_2;                          // x15:56/--
-            x16 += x29 * L4_3;                          // x16:57/--
-            x17 += x29 * L4_4;                          // x17:57/55
-            x18 += x29 * L4_5;                          // x18:57/55
-            x19 += x29 * L4_6;                          // x19:57/52
-            x20 += x29 * L4_7;                          // x20:58/52
-
-            //x28 += (x27 >> 28); x27 &= M28UL;
-            x12 += x28 * L4_0;                          // x12:54/--
-            x13 += x28 * L4_1;                          // x13:54/53
-            x14 += x28 * L4_2;                          // x14:56/--
-            x15 += x28 * L4_3;                          // x15:57/--
-            x16 += x28 * L4_4;                          // x16:57/55
-            x17 += x28 * L4_5;                          // x17:58/--
-            x18 += x28 * L4_6;                          // x18:58/--
-            x19 += x28 * L4_7;                          // x19:58/53
-
-            x27 += (x26 >> 28); x26 &= M28UL;           // x27:28/--, x26:28/--
-            x11 += x27 * L4_0;                          // x11:54/--
-            x12 += x27 * L4_1;                          // x12:54/53
-            x13 += x27 * L4_2;                          // x13:56/--
-            x14 += x27 * L4_3;                          // x14:57/--
-            x15 += x27 * L4_4;                          // x15:57/55
-            x16 += x27 * L4_5;                          // x16:58/--
-            x17 += x27 * L4_6;                          // x17:58/56
-            x18 += x27 * L4_7;                          // x18:59/--
-
-            //x26 += (x25 >> 28); x25 &= M28UL;
-            x10 += x26 * L4_0;                          // x10:54/--
-            x11 += x26 * L4_1;                          // x11:54/53
-            x12 += x26 * L4_2;                          // x12:56/--
-            x13 += x26 * L4_3;                          // x13:57/--
-            x14 += x26 * L4_4;                          // x14:57/55
-            x15 += x26 * L4_5;                          // x15:58/--
-            x16 += x26 * L4_6;                          // x16:58/56
-            x17 += x26 * L4_7;                          // x17:59/--
-
-            x25 += (x24 >> 28); x24 &= M28UL;           // x25:28/--, x24:28/--
-            x09 += x25 * L4_0;                          // x09:54/--
-            x10 += x25 * L4_1;                          // x10:54/53
-            x11 += x25 * L4_2;                          // x11:56/--
-            x12 += x25 * L4_3;                          // x12:57/--
-            x13 += x25 * L4_4;                          // x13:57/55
-            x14 += x25 * L4_5;                          // x14:58/--
-            x15 += x25 * L4_6;                          // x15:58/56
-            x16 += x25 * L4_7;                          // x16:59/--
-
-            x21 += (x20 >> 28); x20 &= M28UL;           // x21:58/--, x20:28/--
-            x22 += (x21 >> 28); x21 &= M28UL;           // x22:57/54, x21:28/--
-            x23 += (x22 >> 28); x22 &= M28UL;           // x23:45/42, x22:28/--
-            x24 += (x23 >> 28); x23 &= M28UL;           // x24:28/18, x23:28/--
-
-            x08 += x24 * L4_0;                          // x08:54/--
-            x09 += x24 * L4_1;                          // x09:55/--
-            x10 += x24 * L4_2;                          // x10:56/46
-            x11 += x24 * L4_3;                          // x11:57/46
-            x12 += x24 * L4_4;                          // x12:57/55
-            x13 += x24 * L4_5;                          // x13:58/--
-            x14 += x24 * L4_6;                          // x14:58/56
-            x15 += x24 * L4_7;                          // x15:59/--
-
-            x07 += x23 * L4_0;                          // x07:54/--
-            x08 += x23 * L4_1;                          // x08:54/53
-            x09 += x23 * L4_2;                          // x09:56/53
-            x10 += x23 * L4_3;                          // x10:57/46
-            x11 += x23 * L4_4;                          // x11:57/55
-            x12 += x23 * L4_5;                          // x12:58/--
-            x13 += x23 * L4_6;                          // x13:58/56
-            x14 += x23 * L4_7;                          // x14:59/--
-
-            x06 += x22 * L4_0;                          // x06:54/--
-            x07 += x22 * L4_1;                          // x07:54/53
-            x08 += x22 * L4_2;                          // x08:56/--
-            x09 += x22 * L4_3;                          // x09:57/53
-            x10 += x22 * L4_4;                          // x10:57/55
-            x11 += x22 * L4_5;                          // x11:58/--
-            x12 += x22 * L4_6;                          // x12:58/56
-            x13 += x22 * L4_7;                          // x13:59/--
-
-            x18 += (x17 >> 28); x17 &= M28UL;           // x18:59/31, x17:28/--
-            x19 += (x18 >> 28); x18 &= M28UL;           // x19:58/54, x18:28/--
-            x20 += (x19 >> 28); x19 &= M28UL;           // x20:30/29, x19:28/--
-            x21 += (x20 >> 28); x20 &= M28UL;           // x21:28/03, x20:28/--
-
-            x05 += x21 * L4_0;                          // x05:54/--
-            x06 += x21 * L4_1;                          // x06:55/--
-            x07 += x21 * L4_2;                          // x07:56/31
-            x08 += x21 * L4_3;                          // x08:57/31
-            x09 += x21 * L4_4;                          // x09:57/56
-            x10 += x21 * L4_5;                          // x10:58/--
-            x11 += x21 * L4_6;                          // x11:58/56
-            x12 += x21 * L4_7;                          // x12:59/--
-
-            x04 += x20 * L4_0;                          // x04:54/--
-            x05 += x20 * L4_1;                          // x05:54/53
-            x06 += x20 * L4_2;                          // x06:56/53
-            x07 += x20 * L4_3;                          // x07:57/31
-            x08 += x20 * L4_4;                          // x08:57/55
-            x09 += x20 * L4_5;                          // x09:58/--
-            x10 += x20 * L4_6;                          // x10:58/56
-            x11 += x20 * L4_7;                          // x11:59/--
-
-            x03 += x19 * L4_0;                          // x03:54/--
-            x04 += x19 * L4_1;                          // x04:54/53
-            x05 += x19 * L4_2;                          // x05:56/--
-            x06 += x19 * L4_3;                          // x06:57/53
-            x07 += x19 * L4_4;                          // x07:57/55
-            x08 += x19 * L4_5;                          // x08:58/--
-            x09 += x19 * L4_6;                          // x09:58/56
-            x10 += x19 * L4_7;                          // x10:59/--
-
-            x15 += (x14 >> 28); x14 &= M28UL;           // x15:59/31, x14:28/--
-            x16 += (x15 >> 28); x15 &= M28UL;           // x16:59/32, x15:28/--
-            x17 += (x16 >> 28); x16 &= M28UL;           // x17:31/29, x16:28/--
-            x18 += (x17 >> 28); x17 &= M28UL;           // x18:28/04, x17:28/--
-
-            x02 += x18 * L4_0;                          // x02:54/--
-            x03 += x18 * L4_1;                          // x03:55/--
-            x04 += x18 * L4_2;                          // x04:56/32
-            x05 += x18 * L4_3;                          // x05:57/32
-            x06 += x18 * L4_4;                          // x06:57/56
-            x07 += x18 * L4_5;                          // x07:58/--
-            x08 += x18 * L4_6;                          // x08:58/56
-            x09 += x18 * L4_7;                          // x09:59/--
-
-            x01 += x17 * L4_0;                          // x01:54/--
-            x02 += x17 * L4_1;                          // x02:54/53
-            x03 += x17 * L4_2;                          // x03:56/53
-            x04 += x17 * L4_3;                          // x04:57/32
-            x05 += x17 * L4_4;                          // x05:57/55
-            x06 += x17 * L4_5;                          // x06:58/--
-            x07 += x17 * L4_6;                          // x07:58/56
-            x08 += x17 * L4_7;                          // x08:59/--
-
-            x16 *= 4;
-            x16 += (x15 >> 26); x15 &= M26UL;
-            x16 += 1;                                   // x16:30/01
-
-            x00 += x16 * L_0;
-            x01 += x16 * L_1;
-            x02 += x16 * L_2;
-            x03 += x16 * L_3;
-            x04 += x16 * L_4;
-            x05 += x16 * L_5;
-            x06 += x16 * L_6;
-            x07 += x16 * L_7;
-
-            x01 += (x00 >> 28); x00 &= M28UL;
-            x02 += (x01 >> 28); x01 &= M28UL;
-            x03 += (x02 >> 28); x02 &= M28UL;
-            x04 += (x03 >> 28); x03 &= M28UL;
-            x05 += (x04 >> 28); x04 &= M28UL;
-            x06 += (x05 >> 28); x05 &= M28UL;
-            x07 += (x06 >> 28); x06 &= M28UL;
-            x08 += (x07 >> 28); x07 &= M28UL;
-            x09 += (x08 >> 28); x08 &= M28UL;
-            x10 += (x09 >> 28); x09 &= M28UL;
-            x11 += (x10 >> 28); x10 &= M28UL;
-            x12 += (x11 >> 28); x11 &= M28UL;
-            x13 += (x12 >> 28); x12 &= M28UL;
-            x14 += (x13 >> 28); x13 &= M28UL;
-            x15 += (x14 >> 28); x14 &= M28UL;
-            x16  = (x15 >> 26); x15 &= M26UL;
-
-            x16 -= 1;
-
-            Debug.Assert(x16 == 0UL || x16 == ulong.MaxValue);
-
-            x00 -= x16 & L_0;
-            x01 -= x16 & L_1;
-            x02 -= x16 & L_2;
-            x03 -= x16 & L_3;
-            x04 -= x16 & L_4;
-            x05 -= x16 & L_5;
-            x06 -= x16 & L_6;
-            x07 -= x16 & L_7;
-
-            x01 += (ulong)((long)x00 >> 28); x00 &= M28UL;
-            x02 += (ulong)((long)x01 >> 28); x01 &= M28UL;
-            x03 += (ulong)((long)x02 >> 28); x02 &= M28UL;
-            x04 += (ulong)((long)x03 >> 28); x03 &= M28UL;
-            x05 += (ulong)((long)x04 >> 28); x04 &= M28UL;
-            x06 += (ulong)((long)x05 >> 28); x05 &= M28UL;
-            x07 += (ulong)((long)x06 >> 28); x06 &= M28UL;
-            x08 += (ulong)((long)x07 >> 28); x07 &= M28UL;
-            x09 += (ulong)((long)x08 >> 28); x08 &= M28UL;
-            x10 += (ulong)((long)x09 >> 28); x09 &= M28UL;
-            x11 += (ulong)((long)x10 >> 28); x10 &= M28UL;
-            x12 += (ulong)((long)x11 >> 28); x11 &= M28UL;
-            x13 += (ulong)((long)x12 >> 28); x12 &= M28UL;
-            x14 += (ulong)((long)x13 >> 28); x13 &= M28UL;
-            x15 += (ulong)((long)x14 >> 28); x14 &= M28UL;
-
-            Debug.Assert(x15 >> 26 == 0UL);
-
-            Codec.Encode56(x00 | (x01 << 28), r,  0);
-            Codec.Encode56(x02 | (x03 << 28), r,  7);
-            Codec.Encode56(x04 | (x05 << 28), r, 14);
-            Codec.Encode56(x06 | (x07 << 28), r, 21);
-            Codec.Encode56(x08 | (x09 << 28), r, 28);
-            Codec.Encode56(x10 | (x11 << 28), r, 35);
-            Codec.Encode56(x12 | (x13 << 28), r, 42);
-            Codec.Encode56(x14 | (x15 << 28), r, 49);
-            //r[ScalarBytes - 1] = 0;
-#endif
-
-            return r;
-        }
-
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        private static void ReduceScalar(ReadOnlySpan<byte> n, Span<byte> r)
-        {
-            ulong x00 =  Codec.Decode32(n[  0..]);              // x00:32/--
-            ulong x01 = (Codec.Decode24(n[  4..]) << 4);        // x01:28/--
-            ulong x02 =  Codec.Decode32(n[  7..]);              // x02:32/--
-            ulong x03 = (Codec.Decode24(n[ 11..]) << 4);        // x03:28/--
-            ulong x04 =  Codec.Decode32(n[ 14..]);              // x04:32/--
-            ulong x05 = (Codec.Decode24(n[ 18..]) << 4);        // x05:28/--
-            ulong x06 =  Codec.Decode32(n[ 21..]);              // x06:32/--
-            ulong x07 = (Codec.Decode24(n[ 25..]) << 4);        // x07:28/--
-            ulong x08 =  Codec.Decode32(n[ 28..]);              // x08:32/--
-            ulong x09 = (Codec.Decode24(n[ 32..]) << 4);        // x09:28/--
-            ulong x10 =  Codec.Decode32(n[ 35..]);              // x10:32/--
-            ulong x11 = (Codec.Decode24(n[ 39..]) << 4);        // x11:28/--
-            ulong x12 =  Codec.Decode32(n[ 42..]);              // x12:32/--
-            ulong x13 = (Codec.Decode24(n[ 46..]) << 4);        // x13:28/--
-            ulong x14 =  Codec.Decode32(n[ 49..]);              // x14:32/--
-            ulong x15 = (Codec.Decode24(n[ 53..]) << 4);        // x15:28/--
-            ulong x16 =  Codec.Decode32(n[ 56..]);              // x16:32/--
-            ulong x17 = (Codec.Decode24(n[ 60..]) << 4);        // x17:28/--
-            ulong x18 =  Codec.Decode32(n[ 63..]);              // x18:32/--
-            ulong x19 = (Codec.Decode24(n[ 67..]) << 4);        // x19:28/--
-            ulong x20 =  Codec.Decode32(n[ 70..]);              // x20:32/--
-            ulong x21 = (Codec.Decode24(n[ 74..]) << 4);        // x21:28/--
-            ulong x22 =  Codec.Decode32(n[ 77..]);              // x22:32/--
-            ulong x23 = (Codec.Decode24(n[ 81..]) << 4);        // x23:28/--
-            ulong x24 =  Codec.Decode32(n[ 84..]);              // x24:32/--
-            ulong x25 = (Codec.Decode24(n[ 88..]) << 4);        // x25:28/--
-            ulong x26 =  Codec.Decode32(n[ 91..]);              // x26:32/--
-            ulong x27 = (Codec.Decode24(n[ 95..]) << 4);        // x27:28/--
-            ulong x28 =  Codec.Decode32(n[ 98..]);              // x28:32/--
-            ulong x29 = (Codec.Decode24(n[102..]) << 4);        // x29:28/--
-            ulong x30 =  Codec.Decode32(n[105..]);              // x30:32/--
-            ulong x31 = (Codec.Decode24(n[109..]) << 4);        // x31:28/--
-            ulong x32 =  Codec.Decode16(n[112..]);              // x32:16/--
-
-            //x32 += (x31 >> 28); x31 &= M28UL;
-            x16 += x32 * L4_0;                          // x16:42/--
-            x17 += x32 * L4_1;                          // x17:41/28
-            x18 += x32 * L4_2;                          // x18:43/42
-            x19 += x32 * L4_3;                          // x19:44/28
-            x20 += x32 * L4_4;                          // x20:43/--
-            x21 += x32 * L4_5;                          // x21:44/28
-            x22 += x32 * L4_6;                          // x22:43/41
-            x23 += x32 * L4_7;                          // x23:45/41
-
-            x31 += (x30 >> 28); x30 &= M28UL;           // x31:28/--, x30:28/--
-            x15 += x31 * L4_0;                          // x15:54/--
-            x16 += x31 * L4_1;                          // x16:53/42
-            x17 += x31 * L4_2;                          // x17:55/54
-            x18 += x31 * L4_3;                          // x18:56/44
-            x19 += x31 * L4_4;                          // x19:55/--
-            x20 += x31 * L4_5;                          // x20:56/43
-            x21 += x31 * L4_6;                          // x21:55/53
-            x22 += x31 * L4_7;                          // x22:57/53
-
-            //x30 += (x29 >> 28); x29 &= M28UL;
-            x14 += x30 * L4_0;                          // x14:54/--
-            x15 += x30 * L4_1;                          // x15:54/53
-            x16 += x30 * L4_2;                          // x16:56/--
-            x17 += x30 * L4_3;                          // x17:57/--
-            x18 += x30 * L4_4;                          // x18:56/55
-            x19 += x30 * L4_5;                          // x19:56/55
-            x20 += x30 * L4_6;                          // x20:57/--
-            x21 += x30 * L4_7;                          // x21:57/56
-
-            x29 += (x28 >> 28); x28 &= M28UL;           // x29:28/--, x28:28/--
-            x13 += x29 * L4_0;                          // x13:54/--
-            x14 += x29 * L4_1;                          // x14:54/53
-            x15 += x29 * L4_2;                          // x15:56/--
-            x16 += x29 * L4_3;                          // x16:57/--
-            x17 += x29 * L4_4;                          // x17:57/55
-            x18 += x29 * L4_5;                          // x18:57/55
-            x19 += x29 * L4_6;                          // x19:57/52
-            x20 += x29 * L4_7;                          // x20:58/52
-
-            //x28 += (x27 >> 28); x27 &= M28UL;
-            x12 += x28 * L4_0;                          // x12:54/--
-            x13 += x28 * L4_1;                          // x13:54/53
-            x14 += x28 * L4_2;                          // x14:56/--
-            x15 += x28 * L4_3;                          // x15:57/--
-            x16 += x28 * L4_4;                          // x16:57/55
-            x17 += x28 * L4_5;                          // x17:58/--
-            x18 += x28 * L4_6;                          // x18:58/--
-            x19 += x28 * L4_7;                          // x19:58/53
-
-            x27 += (x26 >> 28); x26 &= M28UL;           // x27:28/--, x26:28/--
-            x11 += x27 * L4_0;                          // x11:54/--
-            x12 += x27 * L4_1;                          // x12:54/53
-            x13 += x27 * L4_2;                          // x13:56/--
-            x14 += x27 * L4_3;                          // x14:57/--
-            x15 += x27 * L4_4;                          // x15:57/55
-            x16 += x27 * L4_5;                          // x16:58/--
-            x17 += x27 * L4_6;                          // x17:58/56
-            x18 += x27 * L4_7;                          // x18:59/--
-
-            //x26 += (x25 >> 28); x25 &= M28UL;
-            x10 += x26 * L4_0;                          // x10:54/--
-            x11 += x26 * L4_1;                          // x11:54/53
-            x12 += x26 * L4_2;                          // x12:56/--
-            x13 += x26 * L4_3;                          // x13:57/--
-            x14 += x26 * L4_4;                          // x14:57/55
-            x15 += x26 * L4_5;                          // x15:58/--
-            x16 += x26 * L4_6;                          // x16:58/56
-            x17 += x26 * L4_7;                          // x17:59/--
-
-            x25 += (x24 >> 28); x24 &= M28UL;           // x25:28/--, x24:28/--
-            x09 += x25 * L4_0;                          // x09:54/--
-            x10 += x25 * L4_1;                          // x10:54/53
-            x11 += x25 * L4_2;                          // x11:56/--
-            x12 += x25 * L4_3;                          // x12:57/--
-            x13 += x25 * L4_4;                          // x13:57/55
-            x14 += x25 * L4_5;                          // x14:58/--
-            x15 += x25 * L4_6;                          // x15:58/56
-            x16 += x25 * L4_7;                          // x16:59/--
-
-            x21 += (x20 >> 28); x20 &= M28UL;           // x21:58/--, x20:28/--
-            x22 += (x21 >> 28); x21 &= M28UL;           // x22:57/54, x21:28/--
-            x23 += (x22 >> 28); x22 &= M28UL;           // x23:45/42, x22:28/--
-            x24 += (x23 >> 28); x23 &= M28UL;           // x24:28/18, x23:28/--
-
-            x08 += x24 * L4_0;                          // x08:54/--
-            x09 += x24 * L4_1;                          // x09:55/--
-            x10 += x24 * L4_2;                          // x10:56/46
-            x11 += x24 * L4_3;                          // x11:57/46
-            x12 += x24 * L4_4;                          // x12:57/55
-            x13 += x24 * L4_5;                          // x13:58/--
-            x14 += x24 * L4_6;                          // x14:58/56
-            x15 += x24 * L4_7;                          // x15:59/--
-
-            x07 += x23 * L4_0;                          // x07:54/--
-            x08 += x23 * L4_1;                          // x08:54/53
-            x09 += x23 * L4_2;                          // x09:56/53
-            x10 += x23 * L4_3;                          // x10:57/46
-            x11 += x23 * L4_4;                          // x11:57/55
-            x12 += x23 * L4_5;                          // x12:58/--
-            x13 += x23 * L4_6;                          // x13:58/56
-            x14 += x23 * L4_7;                          // x14:59/--
-
-            x06 += x22 * L4_0;                          // x06:54/--
-            x07 += x22 * L4_1;                          // x07:54/53
-            x08 += x22 * L4_2;                          // x08:56/--
-            x09 += x22 * L4_3;                          // x09:57/53
-            x10 += x22 * L4_4;                          // x10:57/55
-            x11 += x22 * L4_5;                          // x11:58/--
-            x12 += x22 * L4_6;                          // x12:58/56
-            x13 += x22 * L4_7;                          // x13:59/--
-
-            x18 += (x17 >> 28); x17 &= M28UL;           // x18:59/31, x17:28/--
-            x19 += (x18 >> 28); x18 &= M28UL;           // x19:58/54, x18:28/--
-            x20 += (x19 >> 28); x19 &= M28UL;           // x20:30/29, x19:28/--
-            x21 += (x20 >> 28); x20 &= M28UL;           // x21:28/03, x20:28/--
-
-            x05 += x21 * L4_0;                          // x05:54/--
-            x06 += x21 * L4_1;                          // x06:55/--
-            x07 += x21 * L4_2;                          // x07:56/31
-            x08 += x21 * L4_3;                          // x08:57/31
-            x09 += x21 * L4_4;                          // x09:57/56
-            x10 += x21 * L4_5;                          // x10:58/--
-            x11 += x21 * L4_6;                          // x11:58/56
-            x12 += x21 * L4_7;                          // x12:59/--
-
-            x04 += x20 * L4_0;                          // x04:54/--
-            x05 += x20 * L4_1;                          // x05:54/53
-            x06 += x20 * L4_2;                          // x06:56/53
-            x07 += x20 * L4_3;                          // x07:57/31
-            x08 += x20 * L4_4;                          // x08:57/55
-            x09 += x20 * L4_5;                          // x09:58/--
-            x10 += x20 * L4_6;                          // x10:58/56
-            x11 += x20 * L4_7;                          // x11:59/--
-
-            x03 += x19 * L4_0;                          // x03:54/--
-            x04 += x19 * L4_1;                          // x04:54/53
-            x05 += x19 * L4_2;                          // x05:56/--
-            x06 += x19 * L4_3;                          // x06:57/53
-            x07 += x19 * L4_4;                          // x07:57/55
-            x08 += x19 * L4_5;                          // x08:58/--
-            x09 += x19 * L4_6;                          // x09:58/56
-            x10 += x19 * L4_7;                          // x10:59/--
-
-            x15 += (x14 >> 28); x14 &= M28UL;           // x15:59/31, x14:28/--
-            x16 += (x15 >> 28); x15 &= M28UL;           // x16:59/32, x15:28/--
-            x17 += (x16 >> 28); x16 &= M28UL;           // x17:31/29, x16:28/--
-            x18 += (x17 >> 28); x17 &= M28UL;           // x18:28/04, x17:28/--
-
-            x02 += x18 * L4_0;                          // x02:54/--
-            x03 += x18 * L4_1;                          // x03:55/--
-            x04 += x18 * L4_2;                          // x04:56/32
-            x05 += x18 * L4_3;                          // x05:57/32
-            x06 += x18 * L4_4;                          // x06:57/56
-            x07 += x18 * L4_5;                          // x07:58/--
-            x08 += x18 * L4_6;                          // x08:58/56
-            x09 += x18 * L4_7;                          // x09:59/--
-
-            x01 += x17 * L4_0;                          // x01:54/--
-            x02 += x17 * L4_1;                          // x02:54/53
-            x03 += x17 * L4_2;                          // x03:56/53
-            x04 += x17 * L4_3;                          // x04:57/32
-            x05 += x17 * L4_4;                          // x05:57/55
-            x06 += x17 * L4_5;                          // x06:58/--
-            x07 += x17 * L4_6;                          // x07:58/56
-            x08 += x17 * L4_7;                          // x08:59/--
-
-            x16 *= 4;
-            x16 += (x15 >> 26); x15 &= M26UL;
-            x16 += 1;                                   // x16:30/01
-
-            x00 += x16 * L_0;
-            x01 += x16 * L_1;
-            x02 += x16 * L_2;
-            x03 += x16 * L_3;
-            x04 += x16 * L_4;
-            x05 += x16 * L_5;
-            x06 += x16 * L_6;
-            x07 += x16 * L_7;
-
-            x01 += (x00 >> 28); x00 &= M28UL;
-            x02 += (x01 >> 28); x01 &= M28UL;
-            x03 += (x02 >> 28); x02 &= M28UL;
-            x04 += (x03 >> 28); x03 &= M28UL;
-            x05 += (x04 >> 28); x04 &= M28UL;
-            x06 += (x05 >> 28); x05 &= M28UL;
-            x07 += (x06 >> 28); x06 &= M28UL;
-            x08 += (x07 >> 28); x07 &= M28UL;
-            x09 += (x08 >> 28); x08 &= M28UL;
-            x10 += (x09 >> 28); x09 &= M28UL;
-            x11 += (x10 >> 28); x10 &= M28UL;
-            x12 += (x11 >> 28); x11 &= M28UL;
-            x13 += (x12 >> 28); x12 &= M28UL;
-            x14 += (x13 >> 28); x13 &= M28UL;
-            x15 += (x14 >> 28); x14 &= M28UL;
-            x16  = (x15 >> 26); x15 &= M26UL;
-
-            x16 -= 1;
-
-            Debug.Assert(x16 == 0UL || x16 == ulong.MaxValue);
-
-            x00 -= x16 & L_0;
-            x01 -= x16 & L_1;
-            x02 -= x16 & L_2;
-            x03 -= x16 & L_3;
-            x04 -= x16 & L_4;
-            x05 -= x16 & L_5;
-            x06 -= x16 & L_6;
-            x07 -= x16 & L_7;
-
-            x01 += (ulong)((long)x00 >> 28); x00 &= M28UL;
-            x02 += (ulong)((long)x01 >> 28); x01 &= M28UL;
-            x03 += (ulong)((long)x02 >> 28); x02 &= M28UL;
-            x04 += (ulong)((long)x03 >> 28); x03 &= M28UL;
-            x05 += (ulong)((long)x04 >> 28); x04 &= M28UL;
-            x06 += (ulong)((long)x05 >> 28); x05 &= M28UL;
-            x07 += (ulong)((long)x06 >> 28); x06 &= M28UL;
-            x08 += (ulong)((long)x07 >> 28); x07 &= M28UL;
-            x09 += (ulong)((long)x08 >> 28); x08 &= M28UL;
-            x10 += (ulong)((long)x09 >> 28); x09 &= M28UL;
-            x11 += (ulong)((long)x10 >> 28); x10 &= M28UL;
-            x12 += (ulong)((long)x11 >> 28); x11 &= M28UL;
-            x13 += (ulong)((long)x12 >> 28); x12 &= M28UL;
-            x14 += (ulong)((long)x13 >> 28); x13 &= M28UL;
-            x15 += (ulong)((long)x14 >> 28); x14 &= M28UL;
-
-            Debug.Assert(x15 >> 26 == 0UL);
-
-            Codec.Encode56(x00 | (x01 << 28), r);
-            Codec.Encode56(x02 | (x03 << 28), r[7..]);
-            Codec.Encode56(x04 | (x05 << 28), r[14..]);
-            Codec.Encode56(x06 | (x07 << 28), r[21..]);
-            Codec.Encode56(x08 | (x09 << 28), r[28..]);
-            Codec.Encode56(x10 | (x11 << 28), r[35..]);
-            Codec.Encode56(x12 | (x13 << 28), r[42..]);
-            Codec.Encode56(x14 | (x15 << 28), r[49..]);
-            r[ScalarBytes - 1] = 0;
-        }
-#endif
-
         private static void ScalarMult(byte[] k, ref PointProjective p, ref PointProjective r)
+#endif
         {
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-            ScalarMult(k.AsSpan(), ref p, ref r);
+            Span<uint> n = stackalloc uint[ScalarUints + 1];
 #else
-            uint[] n = new uint[ScalarUints];
-            DecodeScalar(k, 0, n);
-
-            // Recode the scalar into signed-digit form
-            {
-                uint c1 = Nat.CAdd(ScalarUints, ~(int)n[0] & 1, n, L, n);
-                uint c2 = Nat.ShiftDownBit(ScalarUints, n, c1);             Debug.Assert(c2 == (1U << 31));
-
-                // NOTE: Bit 448 is implicitly set after the signed-digit recoding
-            }
-
-            uint[] table = PointPrecompute(ref p, 8);
-            Init(out PointProjective q);
-
-            // Replace first 4 doublings (2^4 * P) with 1 addition (P + 15 * P)
-            PointLookup15(table, ref r);
-            PointAdd(ref p, ref r);
-
-            int w = 111;
-            for (;;)
-            {
-                PointLookup(n, w, table, ref q);
-                PointAdd(ref q, ref r);
-
-                if (--w < 0)
-                    break;
-
-                for (int i = 0; i < 4; ++i)
-                {
-                    PointDouble(ref r);
-                }
-            }
+            uint[] n = new uint[ScalarUints + 1];
 #endif
-        }
-
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        private static void ScalarMult(ReadOnlySpan<byte> k, ref PointProjective p, ref PointProjective r)
-        {
-            Span<uint> n = stackalloc uint[ScalarUints];
-            DecodeScalar(k, n);
 
-            // Recode the scalar into signed-digit form
-            {
-                uint c1 = Nat.CAdd(ScalarUints, ~(int)n[0] & 1, n, L, n);
-                uint c2 = Nat.ShiftDownBit(ScalarUints, n, c1);             Debug.Assert(c2 == (1U << 31));
+            Scalar448.Decode(k, n);
+            Scalar448.ToSignedDigits(449, n, n);
 
-                // NOTE: Bit 448 is implicitly set after the signed-digit recoding
-            }
+            // NOTE: Bit 448 is handled explicitly by an initial addition
+            Debug.Assert(n[ScalarUints] == 1U);
 
             uint[] table = PointPrecompute(ref p, 8);
             Init(out PointProjective q);
@@ -1748,76 +1153,12 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
                 }
             }
         }
-#endif
 
-        private static void ScalarMultBase(byte[] k, ref PointProjective r)
-        {
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-            ScalarMultBase(k.AsSpan(), ref r);
+        private static void ScalarMultBase(ReadOnlySpan<byte> k, ref PointProjective r)
 #else
-            // Equivalent (but much slower)
-            //Init(out PointProjective p);
-            //F.Copy(B_x, 0, p.x, 0);
-            //F.Copy(B_y, 0, p.y, 0);
-            //F.One(p.z);
-            //ScalarMult(k, ref p, ref r);
-
-            Precompute();
-
-            uint[] n = new uint[ScalarUints + 1];
-            DecodeScalar(k, 0, n);
-
-            // Recode the scalar into signed-digit form
-            {
-                n[ScalarUints] = (1U << (PrecompRange - 448))
-                               + Nat.CAdd(ScalarUints, ~(int)n[0] & 1, n, L, n);
-                uint c = Nat.ShiftDownBit(n.Length, n, 0);
-                Debug.Assert(c == (1U << 31));
-            }
-
-            Init(out PointAffine p);
-
-            PointSetNeutral(ref r);
-
-            int cOff = PrecompSpacing - 1;
-            for (;;)
-            {
-                int tPos = cOff;
-
-                for (int b = 0; b < PrecompBlocks; ++b)
-                {
-                    uint w = 0;
-                    for (int t = 0; t < PrecompTeeth; ++t)
-                    {
-                        uint tBit = n[tPos >> 5] >> (tPos & 0x1F);
-                        w &= ~(1U << t);
-                        w ^= (tBit << t);
-                        tPos += PrecompSpacing;
-                    }
-
-                    int sign = (int)(w >> (PrecompTeeth - 1)) & 1;
-                    int abs = ((int)w ^ -sign) & PrecompMask;
-
-                    Debug.Assert(sign == 0 || sign == 1);
-                    Debug.Assert(0 <= abs && abs < PrecompPoints);
-
-                    PointLookup(b, abs, ref p);
-
-                    F.CNegate(sign, p.x);
-
-                    PointAdd(ref p, ref r);
-                }
-
-                if (--cOff < 0)
-                    break;
-
-                PointDouble(ref r);
-            }
+        private static void ScalarMultBase(byte[] k, ref PointProjective r)
 #endif
-        }
-
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        private static void ScalarMultBase(ReadOnlySpan<byte> k, ref PointProjective r)
         {
             // Equivalent (but much slower)
             //Init(out PointProjective p);
@@ -1828,16 +1169,14 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 
             Precompute();
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
             Span<uint> n = stackalloc uint[ScalarUints + 1];
-            DecodeScalar(k, n);
+#else
+            uint[] n = new uint[ScalarUints + 1];
+#endif
 
-            // Recode the scalar into signed-digit form
-            {
-                n[ScalarUints] = (1U << (PrecompRange - 448))
-                               + Nat.CAdd(ScalarUints, ~(int)n[0] & 1, n, L, n);
-                uint c = Nat.ShiftDownBit(n.Length, n, 0);
-                Debug.Assert(c == (1U << 31));
-            }
+            Scalar448.Decode(k, n);
+            Scalar448.ToSignedDigits(PrecompRange, n, n);
 
             Init(out PointAffine p);
 
@@ -1878,7 +1217,6 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
                 PointDouble(ref r);
             }
         }
-#endif
 
         private static void ScalarMultBaseEncoded(byte[] k, byte[] r, int rOff)
         {
@@ -1945,9 +1283,10 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 #else
             sbyte[] ws_p = new sbyte[447];
 #endif
-            Wnaf.GetSignedVar(L, WnafWidth, ws_p);
+            // NOTE: WnafWidth225 because of the special structure of the order 
+            Scalar448.GetOrderWnafVar(WnafWidth225, ws_p);
 
-            int count = 1 << (WnafWidth - 2);
+            int count = 1 << (WnafWidth225 - 2);
             PointProjective[] tp = new PointProjective[count];
             PointPrecomputeVar(ref p, tp, 0, count);
 
@@ -1970,38 +1309,46 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
         }
 
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        private static void ScalarMultStrausVar(ReadOnlySpan<uint> nb, ReadOnlySpan<uint> np, ref PointProjective p,
-            ref PointProjective r)
+        private static void ScalarMultStraus225Var(ReadOnlySpan<uint> nb, ReadOnlySpan<uint> np, ref PointProjective p,
+            ReadOnlySpan<uint> nq, ref PointProjective q, ref PointProjective r)
 #else
-        private static void ScalarMultStrausVar(uint[] nb, uint[] np, ref PointProjective p, ref PointProjective r)
+        private static void ScalarMultStraus225Var(uint[] nb, uint[] np, ref PointProjective p, uint[] nq,
+            ref PointProjective q, ref PointProjective r)
 #endif
         {
             Debug.Assert(nb.Length == ScalarUints);
-            Debug.Assert(nb[ScalarUints - 1] <= L[ScalarUints - 1]);
-
-            Debug.Assert(np.Length == ScalarUints);
-            Debug.Assert(np[ScalarUints - 1] <= L[ScalarUints - 1]);
+            Debug.Assert((int)nb[ScalarUints - 1] >= 0);
+            Debug.Assert(np.Length == 8);
+            Debug.Assert((int)np[7] >> 31 == (int)np[7] >> 1);
+            Debug.Assert(nq.Length == 8);
+            Debug.Assert((int)nq[7] >> 31 == (int)nq[7] >> 1);
 
             Precompute();
 
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-            Span<sbyte> ws_b = stackalloc sbyte[447];
-            Span<sbyte> ws_p = stackalloc sbyte[447];
+            Span<sbyte> ws_b = stackalloc sbyte[450];
+            Span<sbyte> ws_p = stackalloc sbyte[225];
+            Span<sbyte> ws_q = stackalloc sbyte[225];
 #else
-            sbyte[] ws_b = new sbyte[447];
-            sbyte[] ws_p = new sbyte[447];
+            sbyte[] ws_b = new sbyte[450];
+            sbyte[] ws_p = new sbyte[225];
+            sbyte[] ws_q = new sbyte[225];
 #endif
 
             Wnaf.GetSignedVar(nb, WnafWidthBase, ws_b);
-            Wnaf.GetSignedVar(np, WnafWidth, ws_p);
+            Wnaf.GetSignedVar(np, WnafWidth225, ws_p);
+            Wnaf.GetSignedVar(nq, WnafWidth225, ws_q);
 
-            int count = 1 << (WnafWidth - 2);
+            int count = 1 << (WnafWidth225 - 2);
             PointProjective[] tp = new PointProjective[count];
+            PointProjective[] tq = new PointProjective[count];
             PointPrecomputeVar(ref p, tp, 0, count);
+            PointPrecomputeVar(ref q, tq, 0, count);
 
             PointSetNeutral(ref r);
 
-            for (int bit = 446;;)
+            int bit = 225;
+            while (--bit >= 0)
             {
                 int wb = ws_b[bit];
                 if (wb != 0)
@@ -2010,6 +1357,13 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
                     PointAddVar(wb < 0, ref PrecompBaseWnaf[index], ref r);
                 }
 
+                int wb225 = ws_b[225 + bit];
+                if (wb225 != 0)
+                {
+                    int index = (wb225 >> 1) ^ (wb225 >> 31);
+                    PointAddVar(wb225 < 0, ref PrecompBase225Wnaf[index], ref r);
+                }
+
                 int wp = ws_p[bit];
                 if (wp != 0)
                 {
@@ -2017,11 +1371,18 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
                     PointAddVar(wp < 0, ref tp[index], ref r);
                 }
 
-                if (--bit < 0)
-                    break;
+                int wq = ws_q[bit];
+                if (wq != 0)
+                {
+                    int index = (wq >> 1) ^ (wq >> 31);
+                    PointAddVar(wq < 0, ref tq[index], ref r);
+                }
 
                 PointDouble(ref r);
             }
+
+            // NOTE: Together with the final PointDouble of the loop, this clears the cofactor of 4
+            PointDouble(ref r);
         }
 
         public static void Sign(byte[] sk, int skOff, byte[] ctx, byte[] m, int mOff, int mLen, byte[] sig, int sigOff)
@@ -2076,24 +1437,44 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 
         public static bool ValidatePublicKeyFull(byte[] pk, int pkOff)
         {
-            Init(out PointProjective p);
-            if (!DecodePointVar(pk, pkOff, false, ref p))
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+            Span<byte> A = stackalloc byte[PublicKeySize];  // local PublicKeySize-byte buffer for the encoded key
+            A.CopyFrom(pk.AsSpan(pkOff));  // filled from pk, starting at pkOff
+#else
+            byte[] A = Copy(pk, pkOff, PublicKeySize);  // array path: local copy of the key bytes via Copy(...)
+#endif
+
+            if (!CheckPointFullVar(A))  // screen the raw encoding first; the decode below is only reached if this passes
                 return false;
 
-            Init(out PointProjective r);
-            ScalarMultOrderVar(ref p, ref r);
+            Init(out PointProjective pA);
+            if (!DecodePointVar(A, false, ref pA))  // the local copy A (not pk) is what gets decoded into pA
+                return false;
 
-            F.Normalize(r.x);
-            F.Normalize(r.y);
-            F.Normalize(r.z);
+            Init(out PointProjective pR);
+            ScalarMultOrderVar(ref pA, ref pR);  // NOTE(review): by its name, presumably pR = [order] * pA -- confirm
 
-            return IsNeutralElementVar(r.x, r.y, r.z);
+            F.Normalize(pR.x);  // all three coordinates are normalized before the comparison below
+            F.Normalize(pR.y);
+            F.Normalize(pR.z);
+
+            return IsNeutralElementVar(pR.x, pR.y, pR.z);  // key is accepted only if pR is the neutral element
         }
 
         public static bool ValidatePublicKeyPartial(byte[] pk, int pkOff)
         {
-            Init(out PointProjective p);
-            return DecodePointVar(pk, pkOff, false, ref p);
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+            Span<byte> A = stackalloc byte[PublicKeySize];  // local PublicKeySize-byte buffer for the encoded key
+            A.CopyFrom(pk.AsSpan(pkOff));  // filled from pk, starting at pkOff
+#else
+            byte[] A = Copy(pk, pkOff, PublicKeySize);  // array path: local copy of the key bytes via Copy(...)
+#endif
+
+            if (!CheckPointFullVar(A))  // same raw-encoding screen that ValidatePublicKeyFull applies
+                return false;
+
+            Init(out PointProjective pA);
+            return DecodePointVar(A, false, ref pA);  // stops at a successful decode: no ScalarMultOrderVar / neutral-element test here
         }
 
         public static bool Verify(byte[] sig, int sigOff, byte[] pk, int pkOff, byte[] ctx, byte[] m, int mOff, int mLen)
diff --git a/crypto/src/math/ec/rfc8032/Scalar25519.cs b/crypto/src/math/ec/rfc8032/Scalar25519.cs
index 738ce63cb..0a443abaa 100644
--- a/crypto/src/math/ec/rfc8032/Scalar25519.cs
+++ b/crypto/src/math/ec/rfc8032/Scalar25519.cs
@@ -74,7 +74,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             Span<uint> tt = stackalloc uint[16];
             Nat.Mul(y128, x, tt);
 
-            if ((y128[3] >> 31) != 0)
+            if ((int)y128[3] < 0)
             {
                 Nat.AddTo(8, L, tt[4..]);
                 Nat.SubFrom(8, x, tt[4..]);
@@ -90,7 +90,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             uint[] tt = new uint[12];
             Nat.Mul(y128, 0, 4, x, 0, 8, tt, 0);
 
-            if ((y128[3] >> 31) != 0)
+            if ((int)y128[3] < 0)
             {
                 Nat256.AddTo(L, 0, tt, 4, 0U);
                 Nat256.SubFrom(x, 0, tt, 4);
@@ -100,7 +100,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             Codec.Encode32(tt, 0, 12, bytes, 0);
 
             byte[] r = Reduce(bytes);
-            Codec.Decode32(r, 0, z, 0, 8);
+            Decode(r, z);
         }
 #endif
 
@@ -400,33 +400,33 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             Span<uint> v1 = stackalloc uint[4];     v1[0] = 1U;
 
             int last = 15;
-            int len_Nv = GetBitLengthPositive(last, Nv);
+            int len_Nv = ScalarUtilities.GetBitLengthPositive(last, Nv);
 
             while (len_Nv > TargetLength)
             {
-                int len_p = GetBitLength(last, p);
+                int len_p = ScalarUtilities.GetBitLength(last, p);
                 int s = len_p - len_Nv;
                 s &= ~(s >> 31);
 
                 if ((int)p[last] < 0)
                 {
-                    AddShifted_NP(last, s, Nu, Nv, p);
-                    AddShifted_UV(3, s, u0, u1, v0, v1);
+                    ScalarUtilities.AddShifted_NP(last, s, Nu, Nv, p);
+                    ScalarUtilities.AddShifted_UV(last: 3, s, u0, u1, v0, v1);
                 }
                 else
                 {
-                    SubShifted_NP(last, s, Nu, Nv, p);
-                    SubShifted_UV(3, s, u0, u1, v0, v1);
+                    ScalarUtilities.SubShifted_NP(last, s, Nu, Nv, p);
+                    ScalarUtilities.SubShifted_UV(last: 3, s, u0, u1, v0, v1);
                 }
 
-                if (LessThan(last, Nu, Nv))
+                if (ScalarUtilities.LessThan(last, Nu, Nv))
                 {
-                    Swap(ref u0, ref v0);
-                    Swap(ref u1, ref v1);
-                    Swap(ref Nu, ref Nv);
+                    ScalarUtilities.Swap(ref u0, ref v0);
+                    ScalarUtilities.Swap(ref u1, ref v1);
+                    ScalarUtilities.Swap(ref Nu, ref Nv);
 
                     last = len_Nv >> 5;
-                    len_Nv = GetBitLengthPositive(last, Nv);
+                    len_Nv = ScalarUtilities.GetBitLengthPositive(last, Nv);
                 }
             }
 
@@ -452,33 +452,33 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             uint[] v1 = new uint[4];        v1[0] = 1U;
 
             int last = 15;
-            int len_Nv = GetBitLengthPositive(last, Nv);
+            int len_Nv = ScalarUtilities.GetBitLengthPositive(last, Nv);
 
             while (len_Nv > TargetLength)
             {
-                int len_p = GetBitLength(last, p);
+                int len_p = ScalarUtilities.GetBitLength(last, p);
                 int s = len_p - len_Nv;
                 s &= ~(s >> 31);
 
                 if ((int)p[last] < 0)
                 {
-                    AddShifted_NP(last, s, Nu, Nv, p);
-                    AddShifted_UV(3, s, u0, u1, v0, v1);
+                    ScalarUtilities.AddShifted_NP(last, s, Nu, Nv, p);
+                    ScalarUtilities.AddShifted_UV(last: 3, s, u0, u1, v0, v1);
                 }
                 else
                 {
-                    SubShifted_NP(last, s, Nu, Nv, p);
-                    SubShifted_UV(3, s, u0, u1, v0, v1);
+                    ScalarUtilities.SubShifted_NP(last, s, Nu, Nv, p);
+                    ScalarUtilities.SubShifted_UV(last: 3, s, u0, u1, v0, v1);
                 }
 
-                if (LessThan(last, Nu, Nv))
+                if (ScalarUtilities.LessThan(last, Nu, Nv))
                 {
-                    Swap(ref u0, ref v0);
-                    Swap(ref u1, ref v1);
-                    Swap(ref Nu, ref Nv);
+                    ScalarUtilities.Swap(ref u0, ref v0);
+                    ScalarUtilities.Swap(ref u1, ref v1);
+                    ScalarUtilities.Swap(ref Nu, ref Nv);
 
                     last = len_Nv >> 5;
-                    len_Nv = GetBitLengthPositive(last, Nv);
+                    len_Nv = ScalarUtilities.GetBitLengthPositive(last, Nv);
                 }
             }
 
@@ -489,294 +489,16 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 #endif
 
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        internal static void ToSignedDigits(ReadOnlySpan<uint> x, Span<uint> z)
+        internal static void ToSignedDigits(int bits, ReadOnlySpan<uint> x, Span<uint> z)
 #else
-        internal static void ToSignedDigits(uint[] x, uint[] z)
+        internal static void ToSignedDigits(int bits, uint[] x, uint[] z)
 #endif
         {
+            Debug.Assert(bits == 256);  // 'bits' is not used below; the only accepted value is 256 (debug-build check)
+            Debug.Assert(z.Length >= Size);  // the two Nat calls below operate on Size words of z
+
             uint c1 = Nat.CAdd(Size, ~(int)x[0] & 1, x, L, z);  Debug.Assert(c1 == 0U);
             uint c2 = Nat.ShiftDownBit(Size, z, 1U);            Debug.Assert(c2 == (1U << 31));
         }
-
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void AddShifted_NP(int last, int s, Span<uint> Nu, ReadOnlySpan<uint> Nv, Span<uint> _p)
-#else
-        private static void AddShifted_NP(int last, int s, uint[] Nu, uint[] Nv, uint[] _p)
-#endif
-        {
-            int sWords = s >> 5, sBits = s & 31;
-
-            ulong cc__p = 0UL;
-            ulong cc_Nu = 0UL;
-
-            if (sBits == 0)
-            {
-                for (int i = sWords; i <= last; ++i)
-                {
-                    cc_Nu += Nu[i];
-                    cc_Nu += _p[i - sWords];
-
-                    cc__p += _p[i];
-                    cc__p += Nv[i - sWords];
-                    _p[i]  = (uint)cc__p; cc__p >>= 32;
-
-                    cc_Nu += _p[i - sWords];
-                    Nu[i]  = (uint)cc_Nu; cc_Nu >>= 32;
-                }
-            }
-            else
-            {
-                uint prev_p = 0U;
-                uint prev_q = 0U;
-                uint prev_v = 0U;
-
-                for (int i = sWords; i <= last; ++i)
-                {
-                    uint next_p = _p[i - sWords];
-                    uint p_s = (next_p << sBits) | (prev_p >> -sBits);
-                    prev_p = next_p;
-
-                    cc_Nu += Nu[i];
-                    cc_Nu += p_s;
-
-                    uint next_v = Nv[i - sWords];
-                    uint v_s = (next_v << sBits) | (prev_v >> -sBits);
-                    prev_v = next_v;
-
-                    cc__p += _p[i];
-                    cc__p += v_s;
-                    _p[i]  = (uint)cc__p; cc__p >>= 32;
-
-                    uint next_q = _p[i - sWords];
-                    uint q_s = (next_q << sBits) | (prev_q >> -sBits);
-                    prev_q = next_q;
-
-                    cc_Nu += q_s;
-                    Nu[i]  = (uint)cc_Nu; cc_Nu >>= 32;
-                }
-            }
-        }
-
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void AddShifted_UV(int last, int s, Span<uint> u0, Span<uint> u1, ReadOnlySpan<uint> v0,
-            ReadOnlySpan<uint> v1)
-#else
-        private static void AddShifted_UV(int last, int s, uint[] u0, uint[] u1, uint[] v0, uint[] v1)
-#endif
-        {
-            int sWords = s >> 5, sBits = s & 31;
-
-            ulong cc_u0 = 0UL;
-            ulong cc_u1 = 0UL;
-
-            if (sBits == 0)
-            {
-                for (int i = sWords; i <= last; ++i)
-                {
-                    cc_u0 += u0[i];
-                    cc_u1 += u1[i];
-                    cc_u0 += v0[i - sWords];
-                    cc_u1 += v1[i - sWords];
-                    u0[i]  = (uint)cc_u0; cc_u0 >>= 32;
-                    u1[i]  = (uint)cc_u1; cc_u1 >>= 32;
-                }
-            }
-            else
-            {
-                uint prev_v0 = 0U;
-                uint prev_v1 = 0U;
-
-                for (int i = sWords; i <= last; ++i)
-                {
-                    uint next_v0 = v0[i - sWords];
-                    uint next_v1 = v1[i - sWords];
-                    uint v0_s = (next_v0 << sBits) | (prev_v0 >> -sBits);
-                    uint v1_s = (next_v1 << sBits) | (prev_v1 >> -sBits);
-                    prev_v0 = next_v0;
-                    prev_v1 = next_v1;
-
-                    cc_u0 += u0[i];
-                    cc_u1 += u1[i];
-                    cc_u0 += v0_s;
-                    cc_u1 += v1_s;
-                    u0[i]  = (uint)cc_u0; cc_u0 >>= 32;
-                    u1[i]  = (uint)cc_u1; cc_u1 >>= 32;
-                }
-            }
-        }
-
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static int GetBitLength(int last, ReadOnlySpan<uint> x)
-#else
-        private static int GetBitLength(int last, uint[] x)
-#endif
-        {
-            int i = last;
-            uint sign = (uint)((int)x[i] >> 31);
-            while (i > 0 && x[i] == sign)
-            {
-                --i;
-            }
-            return i * 32 + 32 - Integers.NumberOfLeadingZeros((int)(x[i] ^ sign));
-        }
-
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static int GetBitLengthPositive(int last, ReadOnlySpan<uint> x)
-#else
-        private static int GetBitLengthPositive(int last, uint[] x)
-#endif
-        {
-            int i = last;
-            while (i > 0 && x[i] == 0)
-            {
-                --i;
-            }
-            return i * 32 + 32 - Integers.NumberOfLeadingZeros((int)x[i]);
-        }
-
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool LessThan(int last, ReadOnlySpan<uint> x, ReadOnlySpan<uint> y)
-#else
-        private static bool LessThan(int last, uint[] x, uint[] y)
-#endif
-        {
-            int i = last;
-            if ((int)x[i] < (int)y[i])
-                return true;
-            if ((int)x[i] > (int)y[i])
-                return false;
-            while (--i >= 0)
-            {
-                if (x[i] < y[i])
-                    return true;
-                if (x[i] > y[i])
-                    return false;
-            }
-            return false;
-        }
-
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void SubShifted_NP(int last, int s, Span<uint> Nu, ReadOnlySpan<uint> Nv, Span<uint> _p)
-#else
-        private static void SubShifted_NP(int last, int s, uint[] Nu, uint[] Nv, uint[] _p)
-#endif
-        {
-            int sWords = s >> 5, sBits = s & 31;
-
-            long cc__p = 0L;
-            long cc_Nu = 0L;
-
-            if (sBits == 0)
-            {
-                for (int i = sWords; i <= last; ++i)
-                {
-                    cc_Nu += Nu[i];
-                    cc_Nu -= _p[i - sWords];
-
-                    cc__p += _p[i];
-                    cc__p -= Nv[i - sWords];
-                    _p[i]  = (uint)cc__p; cc__p >>= 32;
-
-                    cc_Nu -= _p[i - sWords];
-                    Nu[i]  = (uint)cc_Nu; cc_Nu >>= 32;
-                }
-            }
-            else
-            {
-                uint prev_p = 0U;
-                uint prev_q = 0U;
-                uint prev_v = 0U;
-
-                for (int i = sWords; i <= last; ++i)
-                {
-                    uint next_p = _p[i - sWords];
-                    uint p_s = (next_p << sBits) | (prev_p >> -sBits);
-                    prev_p = next_p;
-
-                    cc_Nu += Nu[i];
-                    cc_Nu -= p_s;
-
-                    uint next_v = Nv[i - sWords];
-                    uint v_s = (next_v << sBits) | (prev_v >> -sBits);
-                    prev_v = next_v;
-
-                    cc__p += _p[i];
-                    cc__p -= v_s;
-                    _p[i]  = (uint)cc__p; cc__p >>= 32;
-
-                    uint next_q = _p[i - sWords];
-                    uint q_s = (next_q << sBits) | (prev_q >> -sBits);
-                    prev_q = next_q;
-
-                    cc_Nu -= q_s;
-                    Nu[i]  = (uint)cc_Nu; cc_Nu >>= 32;
-                }
-            }
-        }
-
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void SubShifted_UV(int last, int s, Span<uint> u0, Span<uint> u1, ReadOnlySpan<uint> v0,
-            ReadOnlySpan<uint> v1)
-#else
-        private static void SubShifted_UV(int last, int s, uint[] u0, uint[] u1, uint[] v0, uint[] v1)
-#endif
-        {
-            int sWords = s >> 5, sBits = s & 31;
-
-            long cc_u0 = 0L;
-            long cc_u1 = 0L;
-
-            if (sBits == 0)
-            {
-                for (int i = sWords; i <= last; ++i)
-                {
-                    cc_u0 += u0[i];
-                    cc_u1 += u1[i];
-                    cc_u0 -= v0[i - sWords];
-                    cc_u1 -= v1[i - sWords];
-                    u0[i]  = (uint)cc_u0; cc_u0 >>= 32;
-                    u1[i]  = (uint)cc_u1; cc_u1 >>= 32;
-                }
-            }
-            else
-            {
-                uint prev_v0 = 0U;
-                uint prev_v1 = 0U;
-
-                for (int i = sWords; i <= last; ++i)
-                {
-                    uint next_v0 = v0[i - sWords];
-                    uint next_v1 = v1[i - sWords];
-                    uint v0_s = (next_v0 << sBits) | (prev_v0 >> -sBits);
-                    uint v1_s = (next_v1 << sBits) | (prev_v1 >> -sBits);
-                    prev_v0 = next_v0;
-                    prev_v1 = next_v1;
-
-                    cc_u0 += u0[i];
-                    cc_u1 += u1[i];
-                    cc_u0 -= v0_s;
-                    cc_u1 -= v1_s;
-                    u0[i]  = (uint)cc_u0; cc_u0 >>= 32;
-                    u1[i]  = (uint)cc_u1; cc_u1 >>= 32;
-                }
-            }
-        }
-
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void Swap(ref Span<uint> x, ref Span<uint> y)
-#else
-        private static void Swap(ref uint[] x, ref uint[] y)
-#endif
-        {
-            var t = x; x = y; y = t;
-        }
     }
 }
diff --git a/crypto/src/math/ec/rfc8032/Scalar448.cs b/crypto/src/math/ec/rfc8032/Scalar448.cs
new file mode 100644
index 000000000..e17f48a99
--- /dev/null
+++ b/crypto/src/math/ec/rfc8032/Scalar448.cs
@@ -0,0 +1,819 @@
+using System;
+using System.Diagnostics;
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+#endif
+
+using Org.BouncyCastle.Crypto.Utilities;
+using Org.BouncyCastle.Math.Raw;
+using Org.BouncyCastle.Utilities;
+
+namespace Org.BouncyCastle.Math.EC.Rfc8032
+{
+    internal static class Scalar448
+    {
+        internal const int Size = 14;  // scalar length in 32-bit words (Decode fills exactly this many)
+
+        internal const int ScalarBytes = Size * 4 + 1;  // 57: Size words of 4 bytes plus one final byte that CheckVar requires to be zero
+
+        private const ulong M26UL = 0x03FFFFFFUL;  // mask with the low 26 bits set
+        private const ulong M28UL = 0x0FFFFFFFUL;  // mask with the low 28 bits set
+
+        private const int TargetLength = 447;
+
+        private static readonly uint[] L = { 0xAB5844F3U, 0x2378C292U, 0x8DC58F55U, 0x216CC272U, 0xAED63690U,
+            0xC44EDB49U, 0x7CCA23E9U, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU,
+            0x3FFFFFFFU };  // 14 (= Size) words; the bound tested by CheckVar and the value recoded by GetOrderWnafVar
+        private static readonly uint[] LSq = { 0x1BA1FEA9U, 0xC1ADFBB8U, 0x49E0A8B2U, 0xB91BF537U, 0xE764D815U,
+            0x4525492BU, 0xA2B8716DU, 0x4AE17CF6U, 0xBA3C47C4U, 0xF1A9CC14U, 0x7E4D070AU, 0x92052BCBU, 0x9F823B72U,
+            0xC3402A93U, 0x55AC2279U, 0x91BC6149U, 0x46E2C7AAU, 0x10B66139U, 0xD76B1B48U, 0xE2276DA4U, 0xBE6511F4U,
+            0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0x0FFFFFFFU };  // 28 (= 2 * Size) words; NOTE(review): by its name presumably L squared -- confirm
+
+        private const int L_0 = 0x04A7BB0D;     // L_0:26/24
+        private const int L_1 = 0x0873D6D5;     // L_1:27/23
+        private const int L_2 = 0x0A70AADC;     // L_2:27/26
+        private const int L_3 = 0x03D8D723;     // L_3:26/--
+        private const int L_4 = 0x096FDE93;     // L_4:27/25
+        private const int L_5 = 0x0B65129C;     // L_5:27/26
+        private const int L_6 = 0x063BB124;     // L_6:27/--
+        private const int L_7 = 0x08335DC1;     // L_7:27/22
+
+        private const int L4_0 = 0x029EEC34;    // L4_0:25/24
+        private const int L4_1 = 0x01CF5B55;    // L4_1:25/--
+        private const int L4_2 = 0x09C2AB72;    // L4_2:27/25
+        private const int L4_3 = 0x0F635C8E;    // L4_3:28/--
+        private const int L4_4 = 0x05BF7A4C;    // L4_4:26/25
+        private const int L4_5 = 0x0D944A72;    // L4_5:28/--
+        private const int L4_6 = 0x08EEC492;    // L4_6:27/24
+        private const int L4_7 = 0x20CD7705;    // L4_7:29/24
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        internal static bool CheckVar(ReadOnlySpan<byte> s, Span<uint> n)  // decodes s into n; true only if both checks below pass
+        {
+            if (s[ScalarBytes - 1] != 0x00)  // the last of the ScalarBytes bytes must be zero
+                return false;  // n has not been written on this path
+
+            Decode(s, n);  // fills the first Size words of n from s
+            return !Nat.Gte(Size, n, L);  // accept only when n < L, taking Gte as "greater than or equal"
+        }
+#else
+        internal static bool CheckVar(byte[] s, uint[] n)  // array overload; same logic as the span version
+        {
+            if (s[ScalarBytes - 1] != 0x00)  // the last of the ScalarBytes bytes must be zero
+                return false;  // n has not been written on this path
+
+            Decode(s, n);  // fills the first Size words of n from s
+            return !Nat.Gte(Size, n, L);  // accept only when n < L, taking Gte as "greater than or equal"
+        }
+#endif
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        internal static void Decode(ReadOnlySpan<byte> k, Span<uint> n)  // unpacks the encoded scalar k into the words of n
+        {
+            Debug.Assert(k[ScalarBytes - 1] == 0x00);  // precondition: zero final byte; checked in debug builds only
+
+            Codec.Decode32(k, n[..Size]);  // only n[0..Size) is handed to the decoder; any further elements of n are left untouched
+        }
+#else
+        internal static void Decode(byte[] k, uint[] n)  // array overload; same contract as the span version
+        {
+            Debug.Assert(k[ScalarBytes - 1] == 0x00);  // precondition: zero final byte; checked in debug builds only
+
+            Codec.Decode32(k, 0, n, 0, Size);  // decodes Size words starting at k[0] into n[0..Size); n[Size] and beyond are left untouched
+        }
+#endif
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        internal static void GetOrderWnafVar(int width, Span<sbyte> ws)
+#else
+        internal static void GetOrderWnafVar(int width, sbyte[] ws)
+#endif
+        {
+            Wnaf.GetSignedVar(L, width, ws);  // L is private to this class; this wrapper lets callers get its signed recoding at 'width' written into ws
+        }
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        internal static void Multiply225Var(ReadOnlySpan<uint> x, ReadOnlySpan<uint> y225, Span<uint> z)
+        {
+            Span<uint> tt = stackalloc uint[29];  // 29 words = 116 bytes of scratch, later reinterpreted as the byte buffer given to Reduce
+            Nat.Mul(y225, x, tt);  // product of y225 and x goes into tt
+
+            if ((int)y225[7] < 0)  // top bit of y225[7] set: y225 is handled as a negative (two's complement) 8-word value
+            {
+                Nat.AddTo(14, L, tt[8..]);  // word offset 8 = 2^256: adds L * 2^256, which leaves the value unchanged mod L
+                Nat.SubFrom(14, x, tt[8..]);  // subtracts x * 2^256, turning the unsigned product into the signed one
+            }
+
+            Span<byte> r = MemoryMarshal.AsBytes(tt);  // same storage viewed as bytes, no copy; NOTE(review): byte order follows the platform -- confirm little-endian is assumed
+            Reduce(r, r);  // in place: input and output are the same buffer
+            tt[..14].CopyTo(z);  // r aliases tt, so the first 14 words of tt are what gets copied to z
+        }
+#else
+        internal static void Multiply225Var(uint[] x, uint[] y225, uint[] z)
+        {
+            uint[] tt = new uint[22];  // 8 + 14 words: room for the product of the two operands below
+            Nat.Mul(y225, 0, 8, x, 0, 14, tt, 0);  // 8 words of y225 times 14 words of x, into tt
+
+            if ((int)y225[7] < 0)  // top bit of y225[7] set: y225 is handled as a negative (two's complement) 8-word value
+            {
+                Nat.AddTo(14, L, 0, tt, 8);  // word offset 8 = 2^256: adds L * 2^256, which leaves the value unchanged mod L
+                Nat.SubFrom(14, x, 0, tt, 8);  // subtracts x * 2^256, turning the unsigned product into the signed one
+            }
+
+            byte[] bytes = new byte[114];  // tt's 22 words occupy the first 88 bytes; the remainder stays zero
+            Codec.Encode32(tt, 0, 22, bytes, 0);  // serialize all 22 words of tt into bytes
+
+            byte[] r = Reduce(bytes);  // reduced result comes back as a new byte array
+            Decode(r, z);  // unpack the reduced bytes into the first Size words of z
+        }
+#endif
+
+        internal static byte[] Reduce(byte[] n)    // Reduces the wide scalar encoded in n and returns it in a new ScalarBytes-long array.
+        {    // NOTE(review): offsets up to n[112..] are decoded below, so n presumably must hold at least 114 bytes -- confirm callers.
+            byte[] r = new byte[ScalarBytes];    // fresh result buffer (new arrays start zero-filled)
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+            Reduce(n, r);    // span-capable targets: the Span overload does all the work
+#else
+            ulong x00 =  Codec.Decode32(n,   0);                // x00:32/--   even limb: 32 bits taken at byte offset 7*j
+            ulong x01 = (Codec.Decode24(n,   4) << 4);          // x01:28/--   odd limb: the next 24 bits, pre-shifted by 4 so that limb i has weight 2^(28*i)
+            ulong x02 =  Codec.Decode32(n,   7);                // x02:32/--
+            ulong x03 = (Codec.Decode24(n,  11) << 4);          // x03:28/--
+            ulong x04 =  Codec.Decode32(n,  14);                // x04:32/--
+            ulong x05 = (Codec.Decode24(n,  18) << 4);          // x05:28/--
+            ulong x06 =  Codec.Decode32(n,  21);                // x06:32/--
+            ulong x07 = (Codec.Decode24(n,  25) << 4);          // x07:28/--
+            ulong x08 =  Codec.Decode32(n,  28);                // x08:32/--
+            ulong x09 = (Codec.Decode24(n,  32) << 4);          // x09:28/--
+            ulong x10 =  Codec.Decode32(n,  35);                // x10:32/--
+            ulong x11 = (Codec.Decode24(n,  39) << 4);          // x11:28/--
+            ulong x12 =  Codec.Decode32(n,  42);                // x12:32/--
+            ulong x13 = (Codec.Decode24(n,  46) << 4);          // x13:28/--
+            ulong x14 =  Codec.Decode32(n,  49);                // x14:32/--
+            ulong x15 = (Codec.Decode24(n,  53) << 4);          // x15:28/--
+            ulong x16 =  Codec.Decode32(n,  56);                // x16:32/--
+            ulong x17 = (Codec.Decode24(n,  60) << 4);          // x17:28/--
+            ulong x18 =  Codec.Decode32(n,  63);                // x18:32/--
+            ulong x19 = (Codec.Decode24(n,  67) << 4);          // x19:28/--
+            ulong x20 =  Codec.Decode32(n,  70);                // x20:32/--
+            ulong x21 = (Codec.Decode24(n,  74) << 4);          // x21:28/--
+            ulong x22 =  Codec.Decode32(n,  77);                // x22:32/--
+            ulong x23 = (Codec.Decode24(n,  81) << 4);          // x23:28/--
+            ulong x24 =  Codec.Decode32(n,  84);                // x24:32/--
+            ulong x25 = (Codec.Decode24(n,  88) << 4);          // x25:28/--
+            ulong x26 =  Codec.Decode32(n,  91);                // x26:32/--
+            ulong x27 = (Codec.Decode24(n,  95) << 4);          // x27:28/--
+            ulong x28 =  Codec.Decode32(n,  98);                // x28:32/--
+            ulong x29 = (Codec.Decode24(n, 102) << 4);          // x29:28/--
+            ulong x30 =  Codec.Decode32(n, 105);                // x30:32/--
+            ulong x31 = (Codec.Decode24(n, 109) << 4);          // x31:28/--
+            ulong x32 =  Codec.Decode16(n, 112);                // x32:16/--
+
+            //x32 += (x31 >> 28); x31 &= M28UL;   (carry step left out: x31 is still its decoded value of at most 28 bits, so nothing would move)
+            x16 += x32 * L4_0;                          // x16:42/--   fold: a limb 16 positions up is multiplied by L4_0..L4_7 and added 16..9 positions lower
+            x17 += x32 * L4_1;                          // x17:41/28   NOTE(review): L4_* presumably are the limbs of 2^448 mod L -- confirm against their definition
+            x18 += x32 * L4_2;                          // x18:43/42
+            x19 += x32 * L4_3;                          // x19:44/28
+            x20 += x32 * L4_4;                          // x20:43/--
+            x21 += x32 * L4_5;                          // x21:44/28
+            x22 += x32 * L4_6;                          // x22:43/41
+            x23 += x32 * L4_7;                          // x23:45/41
+
+            x31 += (x30 >> 28); x30 &= M28UL;           // x31:28/--, x30:28/--
+            x15 += x31 * L4_0;                          // x15:54/--
+            x16 += x31 * L4_1;                          // x16:53/42
+            x17 += x31 * L4_2;                          // x17:55/54
+            x18 += x31 * L4_3;                          // x18:56/44
+            x19 += x31 * L4_4;                          // x19:55/--
+            x20 += x31 * L4_5;                          // x20:56/43
+            x21 += x31 * L4_6;                          // x21:55/53
+            x22 += x31 * L4_7;                          // x22:57/53
+
+            //x30 += (x29 >> 28); x29 &= M28UL;   (left out for the same reason: x29 has not been modified yet and is at most 28 bits)
+            x14 += x30 * L4_0;                          // x14:54/--
+            x15 += x30 * L4_1;                          // x15:54/53
+            x16 += x30 * L4_2;                          // x16:56/--
+            x17 += x30 * L4_3;                          // x17:57/--
+            x18 += x30 * L4_4;                          // x18:56/55
+            x19 += x30 * L4_5;                          // x19:56/55
+            x20 += x30 * L4_6;                          // x20:57/--
+            x21 += x30 * L4_7;                          // x21:57/56
+
+            x29 += (x28 >> 28); x28 &= M28UL;           // x29:28/--, x28:28/--
+            x13 += x29 * L4_0;                          // x13:54/--
+            x14 += x29 * L4_1;                          // x14:54/53
+            x15 += x29 * L4_2;                          // x15:56/--
+            x16 += x29 * L4_3;                          // x16:57/--
+            x17 += x29 * L4_4;                          // x17:57/55
+            x18 += x29 * L4_5;                          // x18:57/55
+            x19 += x29 * L4_6;                          // x19:57/52
+            x20 += x29 * L4_7;                          // x20:58/52
+
+            //x28 += (x27 >> 28); x27 &= M28UL;   (left out: x27 has not been modified yet and is at most 28 bits)
+            x12 += x28 * L4_0;                          // x12:54/--
+            x13 += x28 * L4_1;                          // x13:54/53
+            x14 += x28 * L4_2;                          // x14:56/--
+            x15 += x28 * L4_3;                          // x15:57/--
+            x16 += x28 * L4_4;                          // x16:57/55
+            x17 += x28 * L4_5;                          // x17:58/--
+            x18 += x28 * L4_6;                          // x18:58/--
+            x19 += x28 * L4_7;                          // x19:58/53
+
+            x27 += (x26 >> 28); x26 &= M28UL;           // x27:28/--, x26:28/--
+            x11 += x27 * L4_0;                          // x11:54/--
+            x12 += x27 * L4_1;                          // x12:54/53
+            x13 += x27 * L4_2;                          // x13:56/--
+            x14 += x27 * L4_3;                          // x14:57/--
+            x15 += x27 * L4_4;                          // x15:57/55
+            x16 += x27 * L4_5;                          // x16:58/--
+            x17 += x27 * L4_6;                          // x17:58/56
+            x18 += x27 * L4_7;                          // x18:59/--
+
+            //x26 += (x25 >> 28); x25 &= M28UL;   (left out: x25 has not been modified yet and is at most 28 bits)
+            x10 += x26 * L4_0;                          // x10:54/--
+            x11 += x26 * L4_1;                          // x11:54/53
+            x12 += x26 * L4_2;                          // x12:56/--
+            x13 += x26 * L4_3;                          // x13:57/--
+            x14 += x26 * L4_4;                          // x14:57/55
+            x15 += x26 * L4_5;                          // x15:58/--
+            x16 += x26 * L4_6;                          // x16:58/56
+            x17 += x26 * L4_7;                          // x17:59/--
+
+            x25 += (x24 >> 28); x24 &= M28UL;           // x25:28/--, x24:28/--
+            x09 += x25 * L4_0;                          // x09:54/--
+            x10 += x25 * L4_1;                          // x10:54/53
+            x11 += x25 * L4_2;                          // x11:56/--
+            x12 += x25 * L4_3;                          // x12:57/--
+            x13 += x25 * L4_4;                          // x13:57/55
+            x14 += x25 * L4_5;                          // x14:58/--
+            x15 += x25 * L4_6;                          // x15:58/56
+            x16 += x25 * L4_7;                          // x16:59/--
+
+            x21 += (x20 >> 28); x20 &= M28UL;           // x21:58/--, x20:28/--
+            x22 += (x21 >> 28); x21 &= M28UL;           // x22:57/54, x21:28/--
+            x23 += (x22 >> 28); x22 &= M28UL;           // x23:45/42, x22:28/--
+            x24 += (x23 >> 28); x23 &= M28UL;           // x24:28/18, x23:28/--
+
+            x08 += x24 * L4_0;                          // x08:54/--
+            x09 += x24 * L4_1;                          // x09:55/--
+            x10 += x24 * L4_2;                          // x10:56/46
+            x11 += x24 * L4_3;                          // x11:57/46
+            x12 += x24 * L4_4;                          // x12:57/55
+            x13 += x24 * L4_5;                          // x13:58/--
+            x14 += x24 * L4_6;                          // x14:58/56
+            x15 += x24 * L4_7;                          // x15:59/--
+
+            x07 += x23 * L4_0;                          // x07:54/--
+            x08 += x23 * L4_1;                          // x08:54/53
+            x09 += x23 * L4_2;                          // x09:56/53
+            x10 += x23 * L4_3;                          // x10:57/46
+            x11 += x23 * L4_4;                          // x11:57/55
+            x12 += x23 * L4_5;                          // x12:58/--
+            x13 += x23 * L4_6;                          // x13:58/56
+            x14 += x23 * L4_7;                          // x14:59/--
+
+            x06 += x22 * L4_0;                          // x06:54/--
+            x07 += x22 * L4_1;                          // x07:54/53
+            x08 += x22 * L4_2;                          // x08:56/--
+            x09 += x22 * L4_3;                          // x09:57/53
+            x10 += x22 * L4_4;                          // x10:57/55
+            x11 += x22 * L4_5;                          // x11:58/--
+            x12 += x22 * L4_6;                          // x12:58/56
+            x13 += x22 * L4_7;                          // x13:59/--
+
+            x18 += (x17 >> 28); x17 &= M28UL;           // x18:59/31, x17:28/--
+            x19 += (x18 >> 28); x18 &= M28UL;           // x19:58/54, x18:28/--
+            x20 += (x19 >> 28); x19 &= M28UL;           // x20:30/29, x19:28/--
+            x21 += (x20 >> 28); x20 &= M28UL;           // x21:28/03, x20:28/--
+
+            x05 += x21 * L4_0;                          // x05:54/--
+            x06 += x21 * L4_1;                          // x06:55/--
+            x07 += x21 * L4_2;                          // x07:56/31
+            x08 += x21 * L4_3;                          // x08:57/31
+            x09 += x21 * L4_4;                          // x09:57/56
+            x10 += x21 * L4_5;                          // x10:58/--
+            x11 += x21 * L4_6;                          // x11:58/56
+            x12 += x21 * L4_7;                          // x12:59/--
+
+            x04 += x20 * L4_0;                          // x04:54/--
+            x05 += x20 * L4_1;                          // x05:54/53
+            x06 += x20 * L4_2;                          // x06:56/53
+            x07 += x20 * L4_3;                          // x07:57/31
+            x08 += x20 * L4_4;                          // x08:57/55
+            x09 += x20 * L4_5;                          // x09:58/--
+            x10 += x20 * L4_6;                          // x10:58/56
+            x11 += x20 * L4_7;                          // x11:59/--
+
+            x03 += x19 * L4_0;                          // x03:54/--
+            x04 += x19 * L4_1;                          // x04:54/53
+            x05 += x19 * L4_2;                          // x05:56/--
+            x06 += x19 * L4_3;                          // x06:57/53
+            x07 += x19 * L4_4;                          // x07:57/55
+            x08 += x19 * L4_5;                          // x08:58/--
+            x09 += x19 * L4_6;                          // x09:58/56
+            x10 += x19 * L4_7;                          // x10:59/--
+
+            x15 += (x14 >> 28); x14 &= M28UL;           // x15:59/31, x14:28/--
+            x16 += (x15 >> 28); x15 &= M28UL;           // x16:59/32, x15:28/--
+            x17 += (x16 >> 28); x16 &= M28UL;           // x17:31/29, x16:28/--
+            x18 += (x17 >> 28); x17 &= M28UL;           // x18:28/04, x17:28/--
+
+            x02 += x18 * L4_0;                          // x02:54/--
+            x03 += x18 * L4_1;                          // x03:55/--
+            x04 += x18 * L4_2;                          // x04:56/32
+            x05 += x18 * L4_3;                          // x05:57/32
+            x06 += x18 * L4_4;                          // x06:57/56
+            x07 += x18 * L4_5;                          // x07:58/--
+            x08 += x18 * L4_6;                          // x08:58/56
+            x09 += x18 * L4_7;                          // x09:59/--
+
+            x01 += x17 * L4_0;                          // x01:54/--
+            x02 += x17 * L4_1;                          // x02:54/53
+            x03 += x17 * L4_2;                          // x03:56/53
+            x04 += x17 * L4_3;                          // x04:57/32
+            x05 += x17 * L4_4;                          // x05:57/55
+            x06 += x17 * L4_5;                          // x06:58/--
+            x07 += x17 * L4_6;                          // x07:58/56
+            x08 += x17 * L4_7;                          // x08:59/--
+
+            x16 *= 4;                                   // limb 16 has weight 2^448 = 4 * 2^446: re-express it in units of 2^446
+            x16 += (x15 >> 26); x15 &= M26UL;           // bits 26 and up of x15 (weight 2^(15*28+26) = 2^446) join x16; x15 keeps 26 bits
+            x16 += 1;                                   // x16:30/01
+
+            x00 += x16 * L_0;                           // add x16 * (L_0..L_7) to the low limbs; NOTE(review): L_* presumably are the limbs of 2^446 - L -- confirm
+            x01 += x16 * L_1;
+            x02 += x16 * L_2;
+            x03 += x16 * L_3;
+            x04 += x16 * L_4;
+            x05 += x16 * L_5;
+            x06 += x16 * L_6;
+            x07 += x16 * L_7;
+
+            x01 += (x00 >> 28); x00 &= M28UL;           // carry chain: normalise x00..x14 to 28 bits each
+            x02 += (x01 >> 28); x01 &= M28UL;
+            x03 += (x02 >> 28); x02 &= M28UL;
+            x04 += (x03 >> 28); x03 &= M28UL;
+            x05 += (x04 >> 28); x04 &= M28UL;
+            x06 += (x05 >> 28); x05 &= M28UL;
+            x07 += (x06 >> 28); x06 &= M28UL;
+            x08 += (x07 >> 28); x07 &= M28UL;
+            x09 += (x08 >> 28); x08 &= M28UL;
+            x10 += (x09 >> 28); x09 &= M28UL;
+            x11 += (x10 >> 28); x10 &= M28UL;
+            x12 += (x11 >> 28); x11 &= M28UL;
+            x13 += (x12 >> 28); x12 &= M28UL;
+            x14 += (x13 >> 28); x13 &= M28UL;
+            x15 += (x14 >> 28); x14 &= M28UL;
+            x16  = (x15 >> 26); x15 &= M26UL;           // x16 is overwritten with whatever overflowed past bit 26 of x15
+
+            x16 -= 1;                                   // becomes a mask: 0 when the overflow was 1, all-ones when it was 0 (see assert)
+
+            Debug.Assert(x16 == 0UL || x16 == ulong.MaxValue);
+
+            x00 -= x16 & L_0;                           // masked subtraction: L_0..L_7 are removed again only when the mask is all-ones
+            x01 -= x16 & L_1;
+            x02 -= x16 & L_2;
+            x03 -= x16 & L_3;
+            x04 -= x16 & L_4;
+            x05 -= x16 & L_5;
+            x06 -= x16 & L_6;
+            x07 -= x16 & L_7;
+
+            x01 += (ulong)((long)x00 >> 28); x00 &= M28UL;    // signed (arithmetic) shift so that a borrow travels upward as a negative carry
+            x02 += (ulong)((long)x01 >> 28); x01 &= M28UL;
+            x03 += (ulong)((long)x02 >> 28); x02 &= M28UL;
+            x04 += (ulong)((long)x03 >> 28); x03 &= M28UL;
+            x05 += (ulong)((long)x04 >> 28); x04 &= M28UL;
+            x06 += (ulong)((long)x05 >> 28); x05 &= M28UL;
+            x07 += (ulong)((long)x06 >> 28); x06 &= M28UL;
+            x08 += (ulong)((long)x07 >> 28); x07 &= M28UL;
+            x09 += (ulong)((long)x08 >> 28); x08 &= M28UL;
+            x10 += (ulong)((long)x09 >> 28); x09 &= M28UL;
+            x11 += (ulong)((long)x10 >> 28); x10 &= M28UL;
+            x12 += (ulong)((long)x11 >> 28); x11 &= M28UL;
+            x13 += (ulong)((long)x12 >> 28); x12 &= M28UL;
+            x14 += (ulong)((long)x13 >> 28); x13 &= M28UL;
+            x15 += (ulong)((long)x14 >> 28); x14 &= M28UL;
+
+            Debug.Assert(x15 >> 26 == 0UL);
+
+            Codec.Encode56(x00 | (x01 << 28), r,  0);    // pack each pair of 28-bit limbs into one 56-bit value, written at 7-byte steps
+            Codec.Encode56(x02 | (x03 << 28), r,  7);
+            Codec.Encode56(x04 | (x05 << 28), r, 14);
+            Codec.Encode56(x06 | (x07 << 28), r, 21);
+            Codec.Encode56(x08 | (x09 << 28), r, 28);
+            Codec.Encode56(x10 | (x11 << 28), r, 35);
+            Codec.Encode56(x12 | (x13 << 28), r, 42);
+            Codec.Encode56(x14 | (x15 << 28), r, 49);
+            //r[ScalarBytes - 1] = 0;   (presumably redundant here because r was freshly allocated above -- TODO confirm Encode56 leaves this byte alone)
+#endif
+
+            return r;
+        }
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        internal static void Reduce(ReadOnlySpan<byte> n, Span<byte> r)    // Reduces the wide scalar encoded in n and writes the result into r.
+        {    // NOTE(review): n is sliced up to n[112..] and r is indexed at ScalarBytes - 1, so both presumably must be at least that long -- confirm callers.
+            ulong x00 =  Codec.Decode32(n[  0..]);              // x00:32/--   even limb: 32 bits taken at byte offset 7*j
+            ulong x01 = (Codec.Decode24(n[  4..]) << 4);        // x01:28/--   odd limb: the next 24 bits, pre-shifted by 4 so that limb i has weight 2^(28*i)
+            ulong x02 =  Codec.Decode32(n[  7..]);              // x02:32/--
+            ulong x03 = (Codec.Decode24(n[ 11..]) << 4);        // x03:28/--
+            ulong x04 =  Codec.Decode32(n[ 14..]);              // x04:32/--
+            ulong x05 = (Codec.Decode24(n[ 18..]) << 4);        // x05:28/--
+            ulong x06 =  Codec.Decode32(n[ 21..]);              // x06:32/--
+            ulong x07 = (Codec.Decode24(n[ 25..]) << 4);        // x07:28/--
+            ulong x08 =  Codec.Decode32(n[ 28..]);              // x08:32/--
+            ulong x09 = (Codec.Decode24(n[ 32..]) << 4);        // x09:28/--
+            ulong x10 =  Codec.Decode32(n[ 35..]);              // x10:32/--
+            ulong x11 = (Codec.Decode24(n[ 39..]) << 4);        // x11:28/--
+            ulong x12 =  Codec.Decode32(n[ 42..]);              // x12:32/--
+            ulong x13 = (Codec.Decode24(n[ 46..]) << 4);        // x13:28/--
+            ulong x14 =  Codec.Decode32(n[ 49..]);              // x14:32/--
+            ulong x15 = (Codec.Decode24(n[ 53..]) << 4);        // x15:28/--
+            ulong x16 =  Codec.Decode32(n[ 56..]);              // x16:32/--
+            ulong x17 = (Codec.Decode24(n[ 60..]) << 4);        // x17:28/--
+            ulong x18 =  Codec.Decode32(n[ 63..]);              // x18:32/--
+            ulong x19 = (Codec.Decode24(n[ 67..]) << 4);        // x19:28/--
+            ulong x20 =  Codec.Decode32(n[ 70..]);              // x20:32/--
+            ulong x21 = (Codec.Decode24(n[ 74..]) << 4);        // x21:28/--
+            ulong x22 =  Codec.Decode32(n[ 77..]);              // x22:32/--
+            ulong x23 = (Codec.Decode24(n[ 81..]) << 4);        // x23:28/--
+            ulong x24 =  Codec.Decode32(n[ 84..]);              // x24:32/--
+            ulong x25 = (Codec.Decode24(n[ 88..]) << 4);        // x25:28/--
+            ulong x26 =  Codec.Decode32(n[ 91..]);              // x26:32/--
+            ulong x27 = (Codec.Decode24(n[ 95..]) << 4);        // x27:28/--
+            ulong x28 =  Codec.Decode32(n[ 98..]);              // x28:32/--
+            ulong x29 = (Codec.Decode24(n[102..]) << 4);        // x29:28/--
+            ulong x30 =  Codec.Decode32(n[105..]);              // x30:32/--
+            ulong x31 = (Codec.Decode24(n[109..]) << 4);        // x31:28/--
+            ulong x32 =  Codec.Decode16(n[112..]);              // x32:16/--
+
+            //x32 += (x31 >> 28); x31 &= M28UL;   (carry step left out: x31 is still its decoded value of at most 28 bits, so nothing would move)
+            x16 += x32 * L4_0;                          // x16:42/--   fold: a limb 16 positions up is multiplied by L4_0..L4_7 and added 16..9 positions lower
+            x17 += x32 * L4_1;                          // x17:41/28   NOTE(review): L4_* presumably are the limbs of 2^448 mod L -- confirm against their definition
+            x18 += x32 * L4_2;                          // x18:43/42
+            x19 += x32 * L4_3;                          // x19:44/28
+            x20 += x32 * L4_4;                          // x20:43/--
+            x21 += x32 * L4_5;                          // x21:44/28
+            x22 += x32 * L4_6;                          // x22:43/41
+            x23 += x32 * L4_7;                          // x23:45/41
+
+            x31 += (x30 >> 28); x30 &= M28UL;           // x31:28/--, x30:28/--
+            x15 += x31 * L4_0;                          // x15:54/--
+            x16 += x31 * L4_1;                          // x16:53/42
+            x17 += x31 * L4_2;                          // x17:55/54
+            x18 += x31 * L4_3;                          // x18:56/44
+            x19 += x31 * L4_4;                          // x19:55/--
+            x20 += x31 * L4_5;                          // x20:56/43
+            x21 += x31 * L4_6;                          // x21:55/53
+            x22 += x31 * L4_7;                          // x22:57/53
+
+            //x30 += (x29 >> 28); x29 &= M28UL;   (left out for the same reason: x29 has not been modified yet and is at most 28 bits)
+            x14 += x30 * L4_0;                          // x14:54/--
+            x15 += x30 * L4_1;                          // x15:54/53
+            x16 += x30 * L4_2;                          // x16:56/--
+            x17 += x30 * L4_3;                          // x17:57/--
+            x18 += x30 * L4_4;                          // x18:56/55
+            x19 += x30 * L4_5;                          // x19:56/55
+            x20 += x30 * L4_6;                          // x20:57/--
+            x21 += x30 * L4_7;                          // x21:57/56
+
+            x29 += (x28 >> 28); x28 &= M28UL;           // x29:28/--, x28:28/--
+            x13 += x29 * L4_0;                          // x13:54/--
+            x14 += x29 * L4_1;                          // x14:54/53
+            x15 += x29 * L4_2;                          // x15:56/--
+            x16 += x29 * L4_3;                          // x16:57/--
+            x17 += x29 * L4_4;                          // x17:57/55
+            x18 += x29 * L4_5;                          // x18:57/55
+            x19 += x29 * L4_6;                          // x19:57/52
+            x20 += x29 * L4_7;                          // x20:58/52
+
+            //x28 += (x27 >> 28); x27 &= M28UL;   (left out: x27 has not been modified yet and is at most 28 bits)
+            x12 += x28 * L4_0;                          // x12:54/--
+            x13 += x28 * L4_1;                          // x13:54/53
+            x14 += x28 * L4_2;                          // x14:56/--
+            x15 += x28 * L4_3;                          // x15:57/--
+            x16 += x28 * L4_4;                          // x16:57/55
+            x17 += x28 * L4_5;                          // x17:58/--
+            x18 += x28 * L4_6;                          // x18:58/--
+            x19 += x28 * L4_7;                          // x19:58/53
+
+            x27 += (x26 >> 28); x26 &= M28UL;           // x27:28/--, x26:28/--
+            x11 += x27 * L4_0;                          // x11:54/--
+            x12 += x27 * L4_1;                          // x12:54/53
+            x13 += x27 * L4_2;                          // x13:56/--
+            x14 += x27 * L4_3;                          // x14:57/--
+            x15 += x27 * L4_4;                          // x15:57/55
+            x16 += x27 * L4_5;                          // x16:58/--
+            x17 += x27 * L4_6;                          // x17:58/56
+            x18 += x27 * L4_7;                          // x18:59/--
+
+            //x26 += (x25 >> 28); x25 &= M28UL;   (left out: x25 has not been modified yet and is at most 28 bits)
+            x10 += x26 * L4_0;                          // x10:54/--
+            x11 += x26 * L4_1;                          // x11:54/53
+            x12 += x26 * L4_2;                          // x12:56/--
+            x13 += x26 * L4_3;                          // x13:57/--
+            x14 += x26 * L4_4;                          // x14:57/55
+            x15 += x26 * L4_5;                          // x15:58/--
+            x16 += x26 * L4_6;                          // x16:58/56
+            x17 += x26 * L4_7;                          // x17:59/--
+
+            x25 += (x24 >> 28); x24 &= M28UL;           // x25:28/--, x24:28/--
+            x09 += x25 * L4_0;                          // x09:54/--
+            x10 += x25 * L4_1;                          // x10:54/53
+            x11 += x25 * L4_2;                          // x11:56/--
+            x12 += x25 * L4_3;                          // x12:57/--
+            x13 += x25 * L4_4;                          // x13:57/55
+            x14 += x25 * L4_5;                          // x14:58/--
+            x15 += x25 * L4_6;                          // x15:58/56
+            x16 += x25 * L4_7;                          // x16:59/--
+
+            x21 += (x20 >> 28); x20 &= M28UL;           // x21:58/--, x20:28/--
+            x22 += (x21 >> 28); x21 &= M28UL;           // x22:57/54, x21:28/--
+            x23 += (x22 >> 28); x22 &= M28UL;           // x23:45/42, x22:28/--
+            x24 += (x23 >> 28); x23 &= M28UL;           // x24:28/18, x23:28/--
+
+            x08 += x24 * L4_0;                          // x08:54/--
+            x09 += x24 * L4_1;                          // x09:55/--
+            x10 += x24 * L4_2;                          // x10:56/46
+            x11 += x24 * L4_3;                          // x11:57/46
+            x12 += x24 * L4_4;                          // x12:57/55
+            x13 += x24 * L4_5;                          // x13:58/--
+            x14 += x24 * L4_6;                          // x14:58/56
+            x15 += x24 * L4_7;                          // x15:59/--
+
+            x07 += x23 * L4_0;                          // x07:54/--
+            x08 += x23 * L4_1;                          // x08:54/53
+            x09 += x23 * L4_2;                          // x09:56/53
+            x10 += x23 * L4_3;                          // x10:57/46
+            x11 += x23 * L4_4;                          // x11:57/55
+            x12 += x23 * L4_5;                          // x12:58/--
+            x13 += x23 * L4_6;                          // x13:58/56
+            x14 += x23 * L4_7;                          // x14:59/--
+
+            x06 += x22 * L4_0;                          // x06:54/--
+            x07 += x22 * L4_1;                          // x07:54/53
+            x08 += x22 * L4_2;                          // x08:56/--
+            x09 += x22 * L4_3;                          // x09:57/53
+            x10 += x22 * L4_4;                          // x10:57/55
+            x11 += x22 * L4_5;                          // x11:58/--
+            x12 += x22 * L4_6;                          // x12:58/56
+            x13 += x22 * L4_7;                          // x13:59/--
+
+            x18 += (x17 >> 28); x17 &= M28UL;           // x18:59/31, x17:28/--
+            x19 += (x18 >> 28); x18 &= M28UL;           // x19:58/54, x18:28/--
+            x20 += (x19 >> 28); x19 &= M28UL;           // x20:30/29, x19:28/--
+            x21 += (x20 >> 28); x20 &= M28UL;           // x21:28/03, x20:28/--
+
+            x05 += x21 * L4_0;                          // x05:54/--
+            x06 += x21 * L4_1;                          // x06:55/--
+            x07 += x21 * L4_2;                          // x07:56/31
+            x08 += x21 * L4_3;                          // x08:57/31
+            x09 += x21 * L4_4;                          // x09:57/56
+            x10 += x21 * L4_5;                          // x10:58/--
+            x11 += x21 * L4_6;                          // x11:58/56
+            x12 += x21 * L4_7;                          // x12:59/--
+
+            x04 += x20 * L4_0;                          // x04:54/--
+            x05 += x20 * L4_1;                          // x05:54/53
+            x06 += x20 * L4_2;                          // x06:56/53
+            x07 += x20 * L4_3;                          // x07:57/31
+            x08 += x20 * L4_4;                          // x08:57/55
+            x09 += x20 * L4_5;                          // x09:58/--
+            x10 += x20 * L4_6;                          // x10:58/56
+            x11 += x20 * L4_7;                          // x11:59/--
+
+            x03 += x19 * L4_0;                          // x03:54/--
+            x04 += x19 * L4_1;                          // x04:54/53
+            x05 += x19 * L4_2;                          // x05:56/--
+            x06 += x19 * L4_3;                          // x06:57/53
+            x07 += x19 * L4_4;                          // x07:57/55
+            x08 += x19 * L4_5;                          // x08:58/--
+            x09 += x19 * L4_6;                          // x09:58/56
+            x10 += x19 * L4_7;                          // x10:59/--
+
+            x15 += (x14 >> 28); x14 &= M28UL;           // x15:59/31, x14:28/--
+            x16 += (x15 >> 28); x15 &= M28UL;           // x16:59/32, x15:28/--
+            x17 += (x16 >> 28); x16 &= M28UL;           // x17:31/29, x16:28/--
+            x18 += (x17 >> 28); x17 &= M28UL;           // x18:28/04, x17:28/--
+
+            x02 += x18 * L4_0;                          // x02:54/--
+            x03 += x18 * L4_1;                          // x03:55/--
+            x04 += x18 * L4_2;                          // x04:56/32
+            x05 += x18 * L4_3;                          // x05:57/32
+            x06 += x18 * L4_4;                          // x06:57/56
+            x07 += x18 * L4_5;                          // x07:58/--
+            x08 += x18 * L4_6;                          // x08:58/56
+            x09 += x18 * L4_7;                          // x09:59/--
+
+            x01 += x17 * L4_0;                          // x01:54/--
+            x02 += x17 * L4_1;                          // x02:54/53
+            x03 += x17 * L4_2;                          // x03:56/53
+            x04 += x17 * L4_3;                          // x04:57/32
+            x05 += x17 * L4_4;                          // x05:57/55
+            x06 += x17 * L4_5;                          // x06:58/--
+            x07 += x17 * L4_6;                          // x07:58/56
+            x08 += x17 * L4_7;                          // x08:59/--
+
+            x16 *= 4;                                   // limb 16 has weight 2^448 = 4 * 2^446: re-express it in units of 2^446
+            x16 += (x15 >> 26); x15 &= M26UL;           // bits 26 and up of x15 (weight 2^(15*28+26) = 2^446) join x16; x15 keeps 26 bits
+            x16 += 1;                                   // x16:30/01
+
+            x00 += x16 * L_0;                           // add x16 * (L_0..L_7) to the low limbs; NOTE(review): L_* presumably are the limbs of 2^446 - L -- confirm
+            x01 += x16 * L_1;
+            x02 += x16 * L_2;
+            x03 += x16 * L_3;
+            x04 += x16 * L_4;
+            x05 += x16 * L_5;
+            x06 += x16 * L_6;
+            x07 += x16 * L_7;
+
+            x01 += (x00 >> 28); x00 &= M28UL;           // carry chain: normalise x00..x14 to 28 bits each
+            x02 += (x01 >> 28); x01 &= M28UL;
+            x03 += (x02 >> 28); x02 &= M28UL;
+            x04 += (x03 >> 28); x03 &= M28UL;
+            x05 += (x04 >> 28); x04 &= M28UL;
+            x06 += (x05 >> 28); x05 &= M28UL;
+            x07 += (x06 >> 28); x06 &= M28UL;
+            x08 += (x07 >> 28); x07 &= M28UL;
+            x09 += (x08 >> 28); x08 &= M28UL;
+            x10 += (x09 >> 28); x09 &= M28UL;
+            x11 += (x10 >> 28); x10 &= M28UL;
+            x12 += (x11 >> 28); x11 &= M28UL;
+            x13 += (x12 >> 28); x12 &= M28UL;
+            x14 += (x13 >> 28); x13 &= M28UL;
+            x15 += (x14 >> 28); x14 &= M28UL;
+            x16  = (x15 >> 26); x15 &= M26UL;           // x16 is overwritten with whatever overflowed past bit 26 of x15
+
+            x16 -= 1;                                   // becomes a mask: 0 when the overflow was 1, all-ones when it was 0 (see assert)
+
+            Debug.Assert(x16 == 0UL || x16 == ulong.MaxValue);
+
+            x00 -= x16 & L_0;                           // masked subtraction: L_0..L_7 are removed again only when the mask is all-ones
+            x01 -= x16 & L_1;
+            x02 -= x16 & L_2;
+            x03 -= x16 & L_3;
+            x04 -= x16 & L_4;
+            x05 -= x16 & L_5;
+            x06 -= x16 & L_6;
+            x07 -= x16 & L_7;
+
+            x01 += (ulong)((long)x00 >> 28); x00 &= M28UL;    // signed (arithmetic) shift so that a borrow travels upward as a negative carry
+            x02 += (ulong)((long)x01 >> 28); x01 &= M28UL;
+            x03 += (ulong)((long)x02 >> 28); x02 &= M28UL;
+            x04 += (ulong)((long)x03 >> 28); x03 &= M28UL;
+            x05 += (ulong)((long)x04 >> 28); x04 &= M28UL;
+            x06 += (ulong)((long)x05 >> 28); x05 &= M28UL;
+            x07 += (ulong)((long)x06 >> 28); x06 &= M28UL;
+            x08 += (ulong)((long)x07 >> 28); x07 &= M28UL;
+            x09 += (ulong)((long)x08 >> 28); x08 &= M28UL;
+            x10 += (ulong)((long)x09 >> 28); x09 &= M28UL;
+            x11 += (ulong)((long)x10 >> 28); x10 &= M28UL;
+            x12 += (ulong)((long)x11 >> 28); x11 &= M28UL;
+            x13 += (ulong)((long)x12 >> 28); x12 &= M28UL;
+            x14 += (ulong)((long)x13 >> 28); x13 &= M28UL;
+            x15 += (ulong)((long)x14 >> 28); x14 &= M28UL;
+
+            Debug.Assert(x15 >> 26 == 0UL);
+
+            Codec.Encode56(x00 | (x01 << 28), r);    // pack each pair of 28-bit limbs into one 56-bit value, written at 7-byte steps
+            Codec.Encode56(x02 | (x03 << 28), r[7..]);
+            Codec.Encode56(x04 | (x05 << 28), r[14..]);
+            Codec.Encode56(x06 | (x07 << 28), r[21..]);
+            Codec.Encode56(x08 | (x09 << 28), r[28..]);
+            Codec.Encode56(x10 | (x11 << 28), r[35..]);
+            Codec.Encode56(x12 | (x13 << 28), r[42..]);
+            Codec.Encode56(x14 | (x15 << 28), r[49..]);
+            r[ScalarBytes - 1] = 0;    // r is caller-supplied, so the final byte is cleared explicitly (presumably not written by the Encode56 calls -- confirm)
+        }
+#endif
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        internal static void ReduceBasisVar(ReadOnlySpan<uint> k, Span<uint> z0, Span<uint> z1)
+        {
+            /*
+             * Variable-time basis reduction, see https://ia.cr/2020/454 (Pornin).
+             *
+             * On return z0 and z1 are two half-size scalars satisfying z1 * k == z0 mod L.
+             *
+             * Working state: the pair (uX, uY) starts as (L, 0) and (vX, vY) as (k, 1). normU starts
+             * as LSq (presumably L squared), normV as k^2 + 1, and dot as L * k.
+             */
+
+            const int uvLast = 7;
+
+            Span<uint> normU = stackalloc uint[28];
+            Span<uint> normV = stackalloc uint[28];
+            Span<uint> dot = stackalloc uint[28];
+            Span<uint> uX = stackalloc uint[8];
+            Span<uint> uY = stackalloc uint[8];
+            Span<uint> vX = stackalloc uint[8];
+            Span<uint> vY = stackalloc uint[8];
+
+            LSq.CopyTo(normU);
+            Nat.Square(14, k, normV);
+            Nat.AddWordTo(28, 1U, normV);
+            Nat.Mul(14, L, k, dot);
+            uX.CopyFrom(L);
+            vX.CopyFrom(k);
+            vY[0] = 1U;
+
+            int top = 27;
+            int bitsNormV = ScalarUtilities.GetBitLengthPositive(top, normV);
+
+            while (bitsNormV > TargetLength)
+            {
+                // Shift by however many bits dot is longer than normV, never by a negative amount.
+                int shift = ScalarUtilities.GetBitLength(top, dot) - bitsNormV;
+                if (shift < 0)
+                {
+                    shift = 0;
+                }
+
+                // The top bit of the highest active word is the sign of dot.
+                bool dotIsNegative = (dot[top] >> 31) != 0U;
+                if (dotIsNegative)
+                {
+                    ScalarUtilities.AddShifted_NP(top, shift, normU, normV, dot);
+                    ScalarUtilities.AddShifted_UV(uvLast, shift, uX, uY, vX, vY);
+                }
+                else
+                {
+                    ScalarUtilities.SubShifted_NP(top, shift, normU, normV, dot);
+                    ScalarUtilities.SubShifted_UV(uvLast, shift, uX, uY, vX, vY);
+                }
+
+                if (!ScalarUtilities.LessThan(top, normU, normV))
+                {
+                    continue;
+                }
+
+                // normU dropped below normV: exchange the roles of the two vectors.
+                ScalarUtilities.Swap(ref uX, ref vX);
+                ScalarUtilities.Swap(ref uY, ref vY);
+                ScalarUtilities.Swap(ref normU, ref normV);
+
+                // Narrow the active word range using the bit length measured before the swap.
+                top = bitsNormV >> 5;
+                bitsNormV = ScalarUtilities.GetBitLengthPositive(top, normV);
+            }
+
+            Debug.Assert((int)vX[7] >> 31 == (int)vX[7] >> 1);
+            Debug.Assert((int)vY[7] >> 31 == (int)vY[7] >> 1);
+
+            // vY * k == vX mod L
+            vX.CopyTo(z0);
+            vY.CopyTo(z1);
+        }
+#else
+        internal static void ReduceBasisVar(uint[] k, uint[] z0, uint[] z1)
+        {
+            /*
+             * Variable-time basis reduction, see https://ia.cr/2020/454 (Pornin).
+             *
+             * On return z0 and z1 are two half-size scalars satisfying z1 * k == z0 mod L.
+             *
+             * Working state: the pair (uX, uY) starts as (L, 0) and (vX, vY) as (k, 1). normU starts
+             * as LSq (presumably L squared), normV as k^2 + 1, and dot as L * k.
+             */
+
+            const int uvLast = 7;
+
+            uint[] normU = new uint[28];
+            uint[] normV = new uint[28];
+            uint[] dot = new uint[28];
+            uint[] uX = new uint[8];
+            uint[] uY = new uint[8];
+            uint[] vX = new uint[8];
+            uint[] vY = new uint[8];
+
+            Array.Copy(LSq, normU, 28);
+            Nat.Square(14, k, normV);
+            Nat.AddWordTo(28, 1U, normV);
+            Nat.Mul(14, L, k, dot);
+            Array.Copy(L, uX, 8);
+            Array.Copy(k, vX, 8);
+            vY[0] = 1U;
+
+            int top = 27;
+            int bitsNormV = ScalarUtilities.GetBitLengthPositive(top, normV);
+
+            while (bitsNormV > TargetLength)
+            {
+                // Shift by however many bits dot is longer than normV, never by a negative amount.
+                int shift = ScalarUtilities.GetBitLength(top, dot) - bitsNormV;
+                if (shift < 0)
+                {
+                    shift = 0;
+                }
+
+                // The top bit of the highest active word is the sign of dot.
+                bool dotIsNegative = (dot[top] >> 31) != 0U;
+                if (dotIsNegative)
+                {
+                    ScalarUtilities.AddShifted_NP(top, shift, normU, normV, dot);
+                    ScalarUtilities.AddShifted_UV(uvLast, shift, uX, uY, vX, vY);
+                }
+                else
+                {
+                    ScalarUtilities.SubShifted_NP(top, shift, normU, normV, dot);
+                    ScalarUtilities.SubShifted_UV(uvLast, shift, uX, uY, vX, vY);
+                }
+
+                if (!ScalarUtilities.LessThan(top, normU, normV))
+                {
+                    continue;
+                }
+
+                // normU dropped below normV: exchange the roles of the two vectors.
+                ScalarUtilities.Swap(ref uX, ref vX);
+                ScalarUtilities.Swap(ref uY, ref vY);
+                ScalarUtilities.Swap(ref normU, ref normV);
+
+                // Narrow the active word range using the bit length measured before the swap.
+                top = bitsNormV >> 5;
+                bitsNormV = ScalarUtilities.GetBitLengthPositive(top, normV);
+            }
+
+            Debug.Assert((int)vX[7] >> 31 == (int)vX[7] >> 1);
+            Debug.Assert((int)vY[7] >> 31 == (int)vY[7] >> 1);
+
+            // vY * k == vX mod L
+            Array.Copy(vX, z0, 8);
+            Array.Copy(vY, z1, 8);
+        }
+#endif
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        internal static void ToSignedDigits(int bits, ReadOnlySpan<uint> x, Span<uint> z)
+#else
+        internal static void ToSignedDigits(int bits, uint[] x, uint[] z)
+#endif
+        {
+            /*
+             * Prepares x for signed-digit recoding and writes Size + 1 words to z:
+             *   - the low Size words receive x, with L added when x is even (Nat.CAdd is presumably
+             *     a conditional add controlled by the 0/1 mask -- confirm);
+             *   - the top word receives a marker bit at position (bits - 448) plus the carry of that add;
+             *   - the whole value is then shifted down by one bit.
+             */
+            Debug.Assert(bits > 448 && bits < 480);
+            Debug.Assert(Size < z.Length);
+
+            // 1 when the lowest bit of x is clear, 0 otherwise.
+            int addL = (int)(~x[0] & 1U);
+            uint marker = 1U << (bits - 448);
+
+            z[Size] = marker + Nat.CAdd(Size, addL, x, L, z);
+
+            // The bit shifted out is expected to be set, reported in the top bit of the return value.
+            uint shiftedOut = Nat.ShiftDownBit(Size + 1, z, 0);
+            Debug.Assert(shiftedOut == (1U << 31));
+        }
+    }
+}
diff --git a/crypto/src/math/ec/rfc8032/ScalarUtilities.cs b/crypto/src/math/ec/rfc8032/ScalarUtilities.cs
new file mode 100644
index 000000000..3407c65c7
--- /dev/null
+++ b/crypto/src/math/ec/rfc8032/ScalarUtilities.cs
@@ -0,0 +1,294 @@
+using System;
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+#endif
+
+using Org.BouncyCastle.Utilities;
+
+namespace Org.BouncyCastle.Math.EC.Rfc8032
+{
+    internal static class ScalarUtilities
+    {
+        // In-place update used by the basis-reduction loop when p is negative. Over limbs (s >> 5)..last it
+        // computes, modulo 2^(32 * (last + 1)):
+        //
+        //     Nu += (p << s) + (p' << s)      where p' = p + (Nv << s)
+        //     p   = p'
+        //
+        // Limbs below (s >> 5) are left untouched and any carry out of limb 'last' is discarded.
+        //
+        //   last  index of the highest limb to process in Nu, Nv and _p
+        //   s     non-negative left shift, in bits
+        //   Nu    updated in place
+        //   Nv    read only
+        //   _p    updated in place
+        //
+        // The first term needs the ORIGINAL limbs of p and the second the UPDATED ones. A single fused pass gets
+        // this wrong once s >= 32: limb (i - sWords) of p has already been overwritten by an earlier iteration
+        // whenever i - sWords >= sWords, so the "original" read would silently pick up the updated value. The
+        // original-p term is therefore added in a separate pass before p is modified.
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static void AddShifted_NP(int last, int s, Span<uint> Nu, ReadOnlySpan<uint> Nv, Span<uint> _p)
+#else
+        internal static void AddShifted_NP(int last, int s, uint[] Nu, uint[] Nv, uint[] _p)
+#endif
+        {
+            int sWords = s >> 5, sBits = s & 31;
+
+            // Pass 1: Nu += (p << s), reading p before any of its limbs change.
+            ulong cc_Nu = 0UL;
+            uint prev_p = 0U;
+
+            for (int i = sWords; i <= last; ++i)
+            {
+                uint next_p = _p[i - sWords];
+
+                // Limb of (p << sBits): the 64-bit pair (next:prev) shifted left, upper half kept. This is also
+                // correct for sBits == 0, where it yields next_p unchanged.
+                ulong pair_p = ((ulong)next_p << 32) | prev_p;
+                uint p_s = (uint)((pair_p << sBits) >> 32);
+                prev_p = next_p;
+
+                cc_Nu += Nu[i];
+                cc_Nu += p_s;
+                Nu[i]  = (uint)cc_Nu; cc_Nu >>= 32;
+            }
+
+            // Pass 2: p += (Nv << s), then Nu += (p' << s) using the updated limbs of p.
+            ulong cc__p = 0UL;
+            cc_Nu = 0UL;
+            uint prev_q = 0U;
+            uint prev_v = 0U;
+
+            for (int i = sWords; i <= last; ++i)
+            {
+                uint next_v = Nv[i - sWords];
+                ulong pair_v = ((ulong)next_v << 32) | prev_v;
+                uint v_s = (uint)((pair_v << sBits) >> 32);
+                prev_v = next_v;
+
+                cc__p += _p[i];
+                cc__p += v_s;
+                _p[i]  = (uint)cc__p; cc__p >>= 32;
+
+                // Read after the store above: limb (i - sWords) now always holds its final value, either because
+                // it lies below sWords (never modified) or because it was written in this or an earlier iteration.
+                uint next_q = _p[i - sWords];
+                ulong pair_q = ((ulong)next_q << 32) | prev_q;
+                uint q_s = (uint)((pair_q << sBits) >> 32);
+                prev_q = next_q;
+
+                cc_Nu += Nu[i];
+                cc_Nu += q_s;
+                Nu[i]  = (uint)cc_Nu; cc_Nu >>= 32;
+            }
+        }
+
+        // Adds (v0 << s) into u0 and (v1 << s) into u1, in place, over limbs (s >> 5)..last. Limbs below
+        // (s >> 5) are not touched and the carry out of limb 'last' is dropped. The two additions are
+        // independent but are carried out side by side in one loop.
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static void AddShifted_UV(int last, int s, Span<uint> u0, Span<uint> u1, ReadOnlySpan<uint> v0,
+            ReadOnlySpan<uint> v1)
+#else
+        internal static void AddShifted_UV(int last, int s, uint[] u0, uint[] u1, uint[] v0, uint[] v1)
+#endif
+        {
+            int wordShift = s >> 5;
+            int bitShift = s & 31;
+
+            ulong carry0 = 0UL, carry1 = 0UL;
+
+            if (bitShift != 0)
+            {
+                // Each shifted limb combines the current source limb with the top bits of the one below it.
+                int backShift = 32 - bitShift;
+                uint below0 = 0U, below1 = 0U;
+
+                for (int dst = wordShift, src = 0; dst <= last; ++dst, ++src)
+                {
+                    uint cur0 = v0[src];
+                    uint cur1 = v1[src];
+                    uint add0 = (cur0 << bitShift) | (below0 >> backShift);
+                    uint add1 = (cur1 << bitShift) | (below1 >> backShift);
+                    below0 = cur0;
+                    below1 = cur1;
+
+                    ulong sum0 = carry0 + u0[dst] + add0;
+                    ulong sum1 = carry1 + u1[dst] + add1;
+                    u0[dst] = (uint)sum0;
+                    u1[dst] = (uint)sum1;
+                    carry0 = sum0 >> 32;
+                    carry1 = sum1 >> 32;
+                }
+            }
+            else
+            {
+                // Whole-word shift: source limbs are used as they are.
+                for (int dst = wordShift, src = 0; dst <= last; ++dst, ++src)
+                {
+                    ulong sum0 = carry0 + u0[dst] + v0[src];
+                    ulong sum1 = carry1 + u1[dst] + v1[src];
+                    u0[dst] = (uint)sum0;
+                    u1[dst] = (uint)sum1;
+                    carry0 = sum0 >> 32;
+                    carry1 = sum1 >> 32;
+                }
+            }
+        }
+
+        // Bit length of the signed value held in limbs x[0..last], with the sign taken from the top bit of
+        // x[last]. Leading limbs equal to the sign extension (all zeros or all ones) are skipped, then the
+        // remaining sign-matching leading bits of the first differing limb are discounted. Returns 0 when
+        // every limb equals the sign extension.
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static int GetBitLength(int last, ReadOnlySpan<uint> x)
+#else
+        internal static int GetBitLength(int last, uint[] x)
+#endif
+        {
+            // All ones for a negative value, all zeros otherwise.
+            uint signWord = (uint)((int)x[last] >> 31);
+
+            int top = last;
+            for (; top > 0; --top)
+            {
+                if (x[top] != signWord)
+                {
+                    break;
+                }
+            }
+
+            // Flipping by the sign word turns sign-extension bits into leading zeros.
+            uint significant = x[top] ^ signWord;
+            return ((top + 1) << 5) - Integers.NumberOfLeadingZeros((int)significant);
+        }
+
+        // Bit length of the value held in limbs x[0..last], treating every limb as unsigned: leading zero limbs
+        // are skipped and the leading zero bits of the first non-zero limb are discounted. Returns 0 when all
+        // limbs are zero.
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static int GetBitLengthPositive(int last, ReadOnlySpan<uint> x)
+#else
+        internal static int GetBitLengthPositive(int last, uint[] x)
+#endif
+        {
+            int top = last;
+            for (; top > 0; --top)
+            {
+                if (x[top] != 0U)
+                {
+                    break;
+                }
+            }
+
+            uint significant = x[top];
+            return ((top + 1) << 5) - Integers.NumberOfLeadingZeros((int)significant);
+        }
+
+        // Returns true when x < y, comparing limbs x[0..last] and y[0..last] from the most significant limb
+        // down. The top limb is compared as a signed 32-bit value, all lower limbs as unsigned. Equal values
+        // yield false.
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static bool LessThan(int last, ReadOnlySpan<uint> x, ReadOnlySpan<uint> y)
+#else
+        internal static bool LessThan(int last, uint[] x, uint[] y)
+#endif
+        {
+            // The most significant limb carries the sign.
+            int topX = (int)x[last];
+            int topY = (int)y[last];
+            if (topX != topY)
+                return topX < topY;
+
+            for (int i = last - 1; i >= 0; --i)
+            {
+                uint limbX = x[i];
+                uint limbY = y[i];
+                if (limbX != limbY)
+                    return limbX < limbY;
+            }
+
+            return false;
+        }
+
+        // In-place update used by the basis-reduction loop when p is non-negative. Over limbs (s >> 5)..last it
+        // computes, modulo 2^(32 * (last + 1)):
+        //
+        //     Nu -= (p << s) + (p' << s)      where p' = p - (Nv << s)
+        //     p   = p'
+        //
+        // Limbs below (s >> 5) are left untouched and any borrow out of limb 'last' is discarded.
+        //
+        //   last  index of the highest limb to process in Nu, Nv and _p
+        //   s     non-negative left shift, in bits
+        //   Nu    updated in place
+        //   Nv    read only
+        //   _p    updated in place
+        //
+        // The first term needs the ORIGINAL limbs of p and the second the UPDATED ones. A single fused pass gets
+        // this wrong once s >= 32: limb (i - sWords) of p has already been overwritten by an earlier iteration
+        // whenever i - sWords >= sWords, so the "original" read would silently pick up the updated value. The
+        // original-p term is therefore subtracted in a separate pass before p is modified.
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static void SubShifted_NP(int last, int s, Span<uint> Nu, ReadOnlySpan<uint> Nv, Span<uint> _p)
+#else
+        internal static void SubShifted_NP(int last, int s, uint[] Nu, uint[] Nv, uint[] _p)
+#endif
+        {
+            int sWords = s >> 5, sBits = s & 31;
+
+            // Pass 1: Nu -= (p << s), reading p before any of its limbs change.
+            // The running value is signed so that ">>= 32" propagates a borrow as -1.
+            long cc_Nu = 0L;
+            uint prev_p = 0U;
+
+            for (int i = sWords; i <= last; ++i)
+            {
+                uint next_p = _p[i - sWords];
+
+                // Limb of (p << sBits): the 64-bit pair (next:prev) shifted left, upper half kept. This is also
+                // correct for sBits == 0, where it yields next_p unchanged.
+                ulong pair_p = ((ulong)next_p << 32) | prev_p;
+                uint p_s = (uint)((pair_p << sBits) >> 32);
+                prev_p = next_p;
+
+                cc_Nu += Nu[i];
+                cc_Nu -= p_s;
+                Nu[i]  = (uint)cc_Nu; cc_Nu >>= 32;
+            }
+
+            // Pass 2: p -= (Nv << s), then Nu -= (p' << s) using the updated limbs of p.
+            long cc__p = 0L;
+            cc_Nu = 0L;
+            uint prev_q = 0U;
+            uint prev_v = 0U;
+
+            for (int i = sWords; i <= last; ++i)
+            {
+                uint next_v = Nv[i - sWords];
+                ulong pair_v = ((ulong)next_v << 32) | prev_v;
+                uint v_s = (uint)((pair_v << sBits) >> 32);
+                prev_v = next_v;
+
+                cc__p += _p[i];
+                cc__p -= v_s;
+                _p[i]  = (uint)cc__p; cc__p >>= 32;
+
+                // Read after the store above: limb (i - sWords) now always holds its final value, either because
+                // it lies below sWords (never modified) or because it was written in this or an earlier iteration.
+                uint next_q = _p[i - sWords];
+                ulong pair_q = ((ulong)next_q << 32) | prev_q;
+                uint q_s = (uint)((pair_q << sBits) >> 32);
+                prev_q = next_q;
+
+                cc_Nu += Nu[i];
+                cc_Nu -= q_s;
+                Nu[i]  = (uint)cc_Nu; cc_Nu >>= 32;
+            }
+        }
+
+        // Subtracts (v0 << s) from u0 and (v1 << s) from u1, in place, over limbs (s >> 5)..last. Limbs below
+        // (s >> 5) are not touched and the borrow out of limb 'last' is dropped. The two subtractions are
+        // independent but are carried out side by side in one loop.
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static void SubShifted_UV(int last, int s, Span<uint> u0, Span<uint> u1, ReadOnlySpan<uint> v0,
+            ReadOnlySpan<uint> v1)
+#else
+        internal static void SubShifted_UV(int last, int s, uint[] u0, uint[] u1, uint[] v0, uint[] v1)
+#endif
+        {
+            int wordShift = s >> 5;
+            int bitShift = s & 31;
+
+            // Signed running values: the arithmetic ">> 32" turns a negative difference into a borrow of -1.
+            long borrow0 = 0L, borrow1 = 0L;
+
+            if (bitShift != 0)
+            {
+                // Each shifted limb combines the current source limb with the top bits of the one below it.
+                int backShift = 32 - bitShift;
+                uint below0 = 0U, below1 = 0U;
+
+                for (int dst = wordShift, src = 0; dst <= last; ++dst, ++src)
+                {
+                    uint cur0 = v0[src];
+                    uint cur1 = v1[src];
+                    uint sub0 = (cur0 << bitShift) | (below0 >> backShift);
+                    uint sub1 = (cur1 << bitShift) | (below1 >> backShift);
+                    below0 = cur0;
+                    below1 = cur1;
+
+                    long diff0 = borrow0 + u0[dst] - sub0;
+                    long diff1 = borrow1 + u1[dst] - sub1;
+                    u0[dst] = (uint)diff0;
+                    u1[dst] = (uint)diff1;
+                    borrow0 = diff0 >> 32;
+                    borrow1 = diff1 >> 32;
+                }
+            }
+            else
+            {
+                // Whole-word shift: source limbs are used as they are.
+                for (int dst = wordShift, src = 0; dst <= last; ++dst, ++src)
+                {
+                    long diff0 = borrow0 + u0[dst] - v0[src];
+                    long diff1 = borrow1 + u1[dst] - v1[src];
+                    u0[dst] = (uint)diff0;
+                    u1[dst] = (uint)diff1;
+                    borrow0 = diff0 >> 32;
+                    borrow1 = diff1 >> 32;
+                }
+            }
+        }
+
+        // Exchanges the two references (spans or arrays) themselves; the elements they refer to are not copied
+        // or modified.
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static void Swap(ref Span<uint> x, ref Span<uint> y)
+#else
+        internal static void Swap(ref uint[] x, ref uint[] y)
+#endif
+        {
+            var formerY = y;
+            y = x;
+            x = formerY;
+        }
+    }
+}