summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--crypto/src/math/ec/rfc7748/X25519Field.cs13
-rw-r--r--crypto/src/math/ec/rfc8032/Codec.cs18
-rw-r--r--crypto/src/math/ec/rfc8032/Ed25519.cs699
-rw-r--r--crypto/src/math/ec/rfc8032/Scalar25519.cs782
-rw-r--r--crypto/src/math/ec/rfc8032/Wnaf.cs2
-rw-r--r--crypto/test/src/math/ec/rfc8032/test/Ed25519Test.cs6
6 files changed, 1011 insertions, 509 deletions
diff --git a/crypto/src/math/ec/rfc7748/X25519Field.cs b/crypto/src/math/ec/rfc7748/X25519Field.cs
index 241710fe9..2504592aa 100644
--- a/crypto/src/math/ec/rfc7748/X25519Field.cs
+++ b/crypto/src/math/ec/rfc7748/X25519Field.cs
@@ -319,6 +319,13 @@ namespace Org.BouncyCastle.Math.EC.Rfc7748
         }
 #endif
 
+        public static void Decode(byte[] x, int[] z)
+        {
+            Decode128(x, 0, z, 0);
+            Decode128(x, 16, z, 5);
+            z[9] &= M24;
+        }
+
         public static void Decode(byte[] x, int xOff, int[] z)
         {
             Decode128(x, xOff, z, 0);
@@ -425,6 +432,12 @@ namespace Org.BouncyCastle.Math.EC.Rfc7748
         }
 #endif
 
+        public static void Encode(int[] x, byte[] z)
+        {
+            Encode128(x, 0, z, 0);
+            Encode128(x, 5, z, 16);
+        }
+
         public static void Encode(int[] x, byte[] z, int zOff)
         {
             Encode128(x, 0, z, zOff);
diff --git a/crypto/src/math/ec/rfc8032/Codec.cs b/crypto/src/math/ec/rfc8032/Codec.cs
index 3aacd12ab..cf1b69b27 100644
--- a/crypto/src/math/ec/rfc8032/Codec.cs
+++ b/crypto/src/math/ec/rfc8032/Codec.cs
@@ -116,6 +116,24 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
         }
 #endif
 
+        internal static void Encode32(uint[] n, int nOff, int nLen, byte[] bs, int bsOff)
+        {
+            for (int i = 0; i < nLen; ++i)
+            {
+                Encode32(n[nOff + i], bs, bsOff + i * 4);
+            }
+        }
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        internal static void Encode32(ReadOnlySpan<uint> n, Span<byte> bs)
+        {
+            for (int i = 0; i < n.Length; ++i)
+            {
+                Encode32(n[i], bs[(i * 4)..]);
+            }
+        }
+#endif
+
         internal static void Encode56(ulong n, byte[] bs, int off)
         {
             Encode32((uint)n, bs, off);
diff --git a/crypto/src/math/ec/rfc8032/Ed25519.cs b/crypto/src/math/ec/rfc8032/Ed25519.cs
index c3e2b5b7e..0c95fade3 100644
--- a/crypto/src/math/ec/rfc8032/Ed25519.cs
+++ b/crypto/src/math/ec/rfc8032/Ed25519.cs
@@ -4,6 +4,7 @@ using System.Diagnostics;
 using Org.BouncyCastle.Crypto;
 using Org.BouncyCastle.Crypto.Digests;
 using Org.BouncyCastle.Math.Raw;
+using Org.BouncyCastle.Pqc.Crypto.SphincsPlus;
 using Org.BouncyCastle.Security;
 using Org.BouncyCastle.Utilities;
 
@@ -34,10 +35,6 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             Ed25519ph = 2,
         }
 
-        private const long M08L = 0x000000FFL;
-        private const long M28L = 0x0FFFFFFFL;
-        private const long M32L = 0xFFFFFFFFL;
-
         private const int CoordUints = 8;
         private const int PointBytes = CoordUints * 4;
         private const int ScalarUints = 8;
@@ -55,14 +52,6 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 
         private static readonly uint[] P = { 0xFFFFFFEDU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU,
             0xFFFFFFFFU, 0xFFFFFFFFU, 0x7FFFFFFFU };
-        private static readonly uint[] L = { 0x5CF5D3EDU, 0x5812631AU, 0xA2F79CD6U, 0x14DEF9DEU, 0x00000000U,
-            0x00000000U, 0x00000000U, 0x10000000U };
-
-        private const int L0 = -0x030A2C13;     // L0:26/--
-        private const int L1 =  0x012631A6;     // L1:24/22
-        private const int L2 =  0x079CD658;     // L2:27/--
-        private const int L3 = -0x006215D1;     // L3:23/--
-        private const int L4 =  0x000014DF;     // L4:12/11
 
         private static readonly uint[] Order8_y1 = { 0x706A17C7, 0x4FD84D3D, 0x760B3CBA, 0x0F67100D, 0xFA53202A,
             0xC6CC392C, 0x77FDC74E, 0x7A03AC92 };
@@ -142,9 +131,9 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 
         private static byte[] CalculateS(byte[] r, byte[] k, byte[] s)
         {
-            uint[] t = new uint[ScalarUints * 2];   DecodeScalar(r, 0, t);
-            uint[] u = new uint[ScalarUints];       DecodeScalar(k, 0, u);
-            uint[] v = new uint[ScalarUints];       DecodeScalar(s, 0, v);
+            uint[] t = new uint[ScalarUints * 2];   Scalar25519.Decode(r, t);
+            uint[] u = new uint[ScalarUints];       Scalar25519.Decode(k, u);
+            uint[] v = new uint[ScalarUints];       Scalar25519.Decode(s, v);
 
             Nat256.MulAddTo(u, v, t);
 
@@ -153,7 +142,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             {
                 Codec.Encode32(t[i], result, i * 4);
             }
-            return ReduceScalar(result);
+            return Scalar25519.Reduce(result);
         }
 
         private static bool CheckContextVar(byte[] ctx, byte phflag)
@@ -228,6 +217,43 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
         }
 #endif
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        private static bool CheckPointFullVar(ReadOnlySpan<byte> p)
+        {
+            uint y7 = Codec.Decode32(p[28..]) & 0x7FFFFFFFU;
+
+            uint t0 = y7;
+            uint t1 = y7 ^ P[7];
+            uint t2 = y7 ^ Order8_y1[7];
+            uint t3 = y7 ^ Order8_y2[7];
+
+            for (int i = CoordUints - 2; i > 0; --i)
+            {
+                uint yi = Codec.Decode32(p[(i * 4)..]);
+
+                t0 |= yi;
+                t1 |= yi ^ P[i];
+                t2 |= yi ^ Order8_y1[i];
+                t3 |= yi ^ Order8_y2[i];
+            }
+
+            uint y0 = Codec.Decode32(p);
+
+            // Reject 0 and 1
+            if (t0 == 0 && y0 <= 1U)
+                return false;
+
+            // Reject P - 1 and non-canonical encodings (i.e. >= P)
+            if (t1 == 0 && y0 >= (P[0] - 1U))
+                return false;
+
+            t2 |= y0 ^ Order8_y1[0];
+            t3 |= y0 ^ Order8_y2[0];
+
+            // Reject order 8 points
+            return (t2 != 0) & (t3 != 0);
+        }
+#else
         private static bool CheckPointFullVar(byte[] p)
         {
             uint y7 = Codec.Decode32(p, 28) & 0x7FFFFFFFU;
@@ -263,19 +289,6 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             // Reject order 8 points
             return (t2 != 0) & (t3 != 0);
         }
-
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        private static bool CheckScalarVar(ReadOnlySpan<byte> s, Span<uint> n)
-        {
-            DecodeScalar(s, n);
-            return !Nat.Gte(ScalarUints, n, L);
-        }
-#else
-        private static bool CheckScalarVar(byte[] s, uint[] n)
-        {
-            DecodeScalar(s, 0, n);
-            return !Nat256.Gte(n, L);
-        }
 #endif
 
         private static byte[] Copy(byte[] buf, int off, int len)
@@ -298,16 +311,15 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             return CreateDigest();
         }
 
-        private static bool DecodePointVar(byte[] p, int pOff, bool negate, ref PointAffine r)
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        private static bool DecodePointVar(ReadOnlySpan<byte> p, bool negate, ref PointAffine r)
+#else
+        private static bool DecodePointVar(byte[] p, bool negate, ref PointAffine r)
+#endif
         {
-            byte[] py = Copy(p, pOff, PointBytes);
-            if (!CheckPointFullVar(py))
-                return false;
+            int x_0 = (p[PointBytes - 1] & 0x80) >> 7;
 
-            int x_0 = (py[PointBytes - 1] & 0x80) >> 7;
-            py[PointBytes - 1] &= 0x7F;
-
-            F.Decode(py, 0, r.y);
+            F.Decode(p, r.y);
 
             int[] u = F.Create();
             int[] v = F.Create();
@@ -332,18 +344,6 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             return true;
         }
 
-        private static void DecodeScalar(byte[] k, int kOff, uint[] n)
-        {
-            Codec.Decode32(k, kOff, n, 0, ScalarUints);
-        }
-
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        private static void DecodeScalar(ReadOnlySpan<byte> k, Span<uint> n)
-        {
-            Codec.Decode32(k, n[..ScalarUints]);
-        }
-#endif
-
         private static void Dom2(IDigest d, byte phflag, byte[] ctx)
         {
             Debug.Assert(ctx != null);
@@ -475,6 +475,22 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             return (x[w] >> b) & 15U;
         }
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        private static void GroupCombBits(Span<uint> n)
+#else
+        private static void GroupCombBits(uint[] n)
+#endif
+        {
+            /*
+             * Because we are using 4 teeth and 8 spacing, each limb of n corresponds to one of the 8 blocks.
+             * Therefore we can efficiently group the bits for each comb position using a (double) shuffle. 
+             */
+            for (int i = 0; i < n.Length; ++i)
+            {
+                n[i] = Interleave.Shuffle2(n[i]);
+            }
+        }
+
         private static void ImplSign(IDigest d, byte[] h, byte[] s, byte[] pk, int pkOff, byte[] ctx, byte phflag,
             byte[] m, int mOff, int mLen, byte[] sig, int sigOff)
         {
@@ -486,7 +502,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             d.BlockUpdate(m, mOff, mLen);
             d.DoFinal(h, 0);
 
-            byte[] r = ReduceScalar(h);
+            byte[] r = Scalar25519.Reduce(h);
             byte[] R = new byte[PointBytes];
             ScalarMultBaseEncoded(r, R, 0);
 
@@ -499,7 +515,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             d.BlockUpdate(m, mOff, mLen);
             d.DoFinal(h, 0);
 
-            byte[] k = ReduceScalar(h);
+            byte[] k = Scalar25519.Reduce(h);
             byte[] S = CalculateS(r, k, s);
 
             Array.Copy(R, 0, sig, sigOff, PointBytes);
@@ -552,21 +568,30 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
                 throw new ArgumentException("ctx");
 
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-            Span<byte> RS = stackalloc byte[PointBytes + ScalarBytes];
-            RS.CopyFrom(sig.AsSpan(sigOff, PointBytes + ScalarBytes));
+            Span<byte> signature = stackalloc byte[SignatureSize];
+            signature.CopyFrom(sig.AsSpan(sigOff, SignatureSize));
+            var R = signature[..PointBytes];
+            var S = signature[PointBytes..];
 
-            var R = RS[..PointBytes];
-            var S = RS[PointBytes..];
+            Span<byte> A = stackalloc byte[PublicKeySize];
+            A.CopyFrom(pk.AsSpan(pkOff));
 
             if (!CheckPointVar(R))
                 return false;
 
             Span<uint> nS = stackalloc uint[ScalarUints];
-            if (!CheckScalarVar(S, nS))
+            if (!Scalar25519.CheckVar(S, nS))
+                return false;
+
+            if (!CheckPointFullVar(A))
+                return false;
+
+            Init(out PointAffine pR);
+            if (!DecodePointVar(R, true, ref pR))
                 return false;
 
             Init(out PointAffine pA);
-            if (!DecodePointVar(pk, pkOff, true, ref pA))
+            if (!DecodePointVar(A, true, ref pA))
                 return false;
 
             IDigest d = CreateDigest();
@@ -577,34 +602,41 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
                 Dom2(d, phflag, ctx);
             }
             d.BlockUpdate(R);
-            d.BlockUpdate(pk.AsSpan(pkOff, PointBytes));
+            d.BlockUpdate(A);
             d.BlockUpdate(m.AsSpan(mOff, mLen));
             d.DoFinal(h);
 
             Span<byte> k = stackalloc byte[ScalarBytes];
-            ReduceScalar(h, k);
+            Scalar25519.Reduce(h, k);
 
             Span<uint> nA = stackalloc uint[ScalarUints];
-            DecodeScalar(k, nA);
-
-            Init(out PointAccum pR);
-            ScalarMultStrausVar(nS, nA, ref pA, ref pR);
+            Scalar25519.Decode(k, nA);
 
-            Span<byte> check = stackalloc byte[PointBytes];
-            return 0 != EncodePoint(ref pR, check) && check.SequenceEqual(R);
+            Span<uint> v0 = stackalloc uint[4];
+            Span<uint> v1 = stackalloc uint[4];
+            Scalar25519.ReduceBasisVar(nA, v0, v1);
+            Scalar25519.Multiply128Var(nS, v1, nS);
 #else
             byte[] R = Copy(sig, sigOff, PointBytes);
             byte[] S = Copy(sig, sigOff + PointBytes, ScalarBytes);
+            byte[] A = Copy(pk, pkOff, PublicKeySize);
 
             if (!CheckPointVar(R))
                 return false;
 
             uint[] nS = new uint[ScalarUints];
-            if (!CheckScalarVar(S, nS))
+            if (!Scalar25519.CheckVar(S, nS))
+                return false;
+
+            if (!CheckPointFullVar(A))
+                return false;
+
+            Init(out PointAffine pR);
+            if (!DecodePointVar(R, true, ref pR))
                 return false;
 
             Init(out PointAffine pA);
-            if (!DecodePointVar(pk, pkOff, true, ref pA))
+            if (!DecodePointVar(A, true, ref pA))
                 return false;
 
             IDigest d = CreateDigest();
@@ -615,21 +647,29 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
                 Dom2(d, phflag, ctx);
             }
             d.BlockUpdate(R, 0, PointBytes);
-            d.BlockUpdate(pk, pkOff, PointBytes);
+            d.BlockUpdate(A, 0, PointBytes);
             d.BlockUpdate(m, mOff, mLen);
             d.DoFinal(h, 0);
 
-            byte[] k = ReduceScalar(h);
+            byte[] k = Scalar25519.Reduce(h);
 
             uint[] nA = new uint[ScalarUints];
-            DecodeScalar(k, 0, nA);
-
-            Init(out PointAccum pR);
-            ScalarMultStrausVar(nS, nA, ref pA, ref pR);
+            Scalar25519.Decode(k, nA);
 
-            byte[] check = new byte[PointBytes];
-            return 0 != EncodePoint(ref pR, check, 0) && Arrays.AreEqual(check, R);
+            uint[] v0 = new uint[4];
+            uint[] v1 = new uint[4];
+            Scalar25519.ReduceBasisVar(nA, v0, v1);
+            Scalar25519.Multiply128Var(nS, v1, nS);
 #endif
+
+            Init(out PointAccum pZ);
+            ScalarMultStraus128Var(nS, v0, ref pA, v1, ref pR, ref pZ);
+
+            F.Normalize(pZ.x);
+            F.Normalize(pZ.y);
+            F.Normalize(pZ.z);
+
+            return IsNeutralElementVar(pZ.x, pZ.y, pZ.z);
         }
 
         private static void Init(out PointAccum r)
@@ -1249,332 +1289,20 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
         }
 #endif
 
-        private static byte[] ReduceScalar(byte[] n)
-        {
-            byte[] r = new byte[ScalarBytes];
-
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-            ReduceScalar(n, r);
+        private static void ScalarMult(ReadOnlySpan<byte> k, ref PointAffine p, ref PointAccum r)
 #else
-            long x00 =  Codec.Decode32(n,  0)       & M32L;         // x00:32/--
-            long x01 = (Codec.Decode24(n,  4) << 4) & M32L;         // x01:28/--
-            long x02 =  Codec.Decode32(n,  7)       & M32L;         // x02:32/--
-            long x03 = (Codec.Decode24(n, 11) << 4) & M32L;         // x03:28/--
-            long x04 =  Codec.Decode32(n, 14)       & M32L;         // x04:32/--
-            long x05 = (Codec.Decode24(n, 18) << 4) & M32L;         // x05:28/--
-            long x06 =  Codec.Decode32(n, 21)       & M32L;         // x06:32/--
-            long x07 = (Codec.Decode24(n, 25) << 4) & M32L;         // x07:28/--
-            long x08 =  Codec.Decode32(n, 28)       & M32L;         // x08:32/--
-            long x09 = (Codec.Decode24(n, 32) << 4) & M32L;         // x09:28/--
-            long x10 =  Codec.Decode32(n, 35)       & M32L;         // x10:32/--
-            long x11 = (Codec.Decode24(n, 39) << 4) & M32L;         // x11:28/--
-            long x12 =  Codec.Decode32(n, 42)       & M32L;         // x12:32/--
-            long x13 = (Codec.Decode24(n, 46) << 4) & M32L;         // x13:28/--
-            long x14 =  Codec.Decode32(n, 49)       & M32L;         // x14:32/--
-            long x15 = (Codec.Decode24(n, 53) << 4) & M32L;         // x15:28/--
-            long x16 =  Codec.Decode32(n, 56)       & M32L;         // x16:32/--
-            long x17 = (Codec.Decode24(n, 60) << 4) & M32L;         // x17:28/--
-            long x18 =                 n[63]        & M08L;         // x18:08/--
-            long t;
-
-            //x18 += (x17 >> 28); x17 &= M28L;
-            x09 -= x18 * L0;                            // x09:34/28
-            x10 -= x18 * L1;                            // x10:33/30
-            x11 -= x18 * L2;                            // x11:35/28
-            x12 -= x18 * L3;                            // x12:32/31
-            x13 -= x18 * L4;                            // x13:28/21
-
-            x17 += (x16 >> 28); x16 &= M28L;            // x17:28/--, x16:28/--
-            x08 -= x17 * L0;                            // x08:54/32
-            x09 -= x17 * L1;                            // x09:52/51
-            x10 -= x17 * L2;                            // x10:55/34
-            x11 -= x17 * L3;                            // x11:51/36
-            x12 -= x17 * L4;                            // x12:41/--
-
-            //x16 += (x15 >> 28); x15 &= M28L;
-            x07 -= x16 * L0;                            // x07:54/28
-            x08 -= x16 * L1;                            // x08:54/53
-            x09 -= x16 * L2;                            // x09:55/53
-            x10 -= x16 * L3;                            // x10:55/52
-            x11 -= x16 * L4;                            // x11:51/41
-
-            x15 += (x14 >> 28); x14 &= M28L;            // x15:28/--, x14:28/--
-            x06 -= x15 * L0;                            // x06:54/32
-            x07 -= x15 * L1;                            // x07:54/53
-            x08 -= x15 * L2;                            // x08:56/--
-            x09 -= x15 * L3;                            // x09:55/54
-            x10 -= x15 * L4;                            // x10:55/53
-
-            //x14 += (x13 >> 28); x13 &= M28L;
-            x05 -= x14 * L0;                            // x05:54/28
-            x06 -= x14 * L1;                            // x06:54/53
-            x07 -= x14 * L2;                            // x07:56/--
-            x08 -= x14 * L3;                            // x08:56/51
-            x09 -= x14 * L4;                            // x09:56/--
-
-            x13 += (x12 >> 28); x12 &= M28L;            // x13:28/22, x12:28/--
-            x04 -= x13 * L0;                            // x04:54/49
-            x05 -= x13 * L1;                            // x05:54/53
-            x06 -= x13 * L2;                            // x06:56/--
-            x07 -= x13 * L3;                            // x07:56/52
-            x08 -= x13 * L4;                            // x08:56/52
-
-            x12 += (x11 >> 28); x11 &= M28L;            // x12:28/24, x11:28/--
-            x03 -= x12 * L0;                            // x03:54/49
-            x04 -= x12 * L1;                            // x04:54/51
-            x05 -= x12 * L2;                            // x05:56/--
-            x06 -= x12 * L3;                            // x06:56/52
-            x07 -= x12 * L4;                            // x07:56/53
-
-            x11 += (x10 >> 28); x10 &= M28L;            // x11:29/--, x10:28/--
-            x02 -= x11 * L0;                            // x02:55/32
-            x03 -= x11 * L1;                            // x03:55/--
-            x04 -= x11 * L2;                            // x04:56/55
-            x05 -= x11 * L3;                            // x05:56/52
-            x06 -= x11 * L4;                            // x06:56/53
-
-            x10 += (x09 >> 28); x09 &= M28L;            // x10:29/--, x09:28/--
-            x01 -= x10 * L0;                            // x01:55/28
-            x02 -= x10 * L1;                            // x02:55/54
-            x03 -= x10 * L2;                            // x03:56/55
-            x04 -= x10 * L3;                            // x04:57/--
-            x05 -= x10 * L4;                            // x05:56/53
-
-            x08 += (x07 >> 28); x07 &= M28L;            // x08:56/53, x07:28/--
-            x09 += (x08 >> 28); x08 &= M28L;            // x09:29/25, x08:28/--
-
-            t    = (x08 >> 27) & 1L;
-            x09 += t;                                   // x09:29/26
-
-            x00 -= x09 * L0;                            // x00:55/53
-            x01 -= x09 * L1;                            // x01:55/54
-            x02 -= x09 * L2;                            // x02:57/--
-            x03 -= x09 * L3;                            // x03:57/--
-            x04 -= x09 * L4;                            // x04:57/42
-
-            x01 += (x00 >> 28); x00 &= M28L;
-            x02 += (x01 >> 28); x01 &= M28L;
-            x03 += (x02 >> 28); x02 &= M28L;
-            x04 += (x03 >> 28); x03 &= M28L;
-            x05 += (x04 >> 28); x04 &= M28L;
-            x06 += (x05 >> 28); x05 &= M28L;
-            x07 += (x06 >> 28); x06 &= M28L;
-            x08 += (x07 >> 28); x07 &= M28L;
-            x09  = (x08 >> 28); x08 &= M28L;
-
-            x09 -= t;
-
-            Debug.Assert(x09 == 0L || x09 == -1L);
-
-            x00 += x09 & L0;
-            x01 += x09 & L1;
-            x02 += x09 & L2;
-            x03 += x09 & L3;
-            x04 += x09 & L4;
-
-            x01 += (x00 >> 28); x00 &= M28L;
-            x02 += (x01 >> 28); x01 &= M28L;
-            x03 += (x02 >> 28); x02 &= M28L;
-            x04 += (x03 >> 28); x03 &= M28L;
-            x05 += (x04 >> 28); x04 &= M28L;
-            x06 += (x05 >> 28); x05 &= M28L;
-            x07 += (x06 >> 28); x06 &= M28L;
-            x08 += (x07 >> 28); x07 &= M28L;
-
-            Codec.Encode56((ulong)(x00 | (x01 << 28)), r, 0);
-            Codec.Encode56((ulong)(x02 | (x03 << 28)), r, 7);
-            Codec.Encode56((ulong)(x04 | (x05 << 28)), r, 14);
-            Codec.Encode56((ulong)(x06 | (x07 << 28)), r, 21);
-            Codec.Encode32((uint)x08, r, 28);
-#endif
-
-            return r;
-        }
-
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        private static void ReduceScalar(ReadOnlySpan<byte> n, Span<byte> r)
-        {
-            long x00 =  Codec.Decode32(n[ 0..])       & M32L;       // x00:32/--
-            long x01 = (Codec.Decode24(n[ 4..]) << 4) & M32L;       // x01:28/--
-            long x02 =  Codec.Decode32(n[ 7..])       & M32L;       // x02:32/--
-            long x03 = (Codec.Decode24(n[11..]) << 4) & M32L;       // x03:28/--
-            long x04 =  Codec.Decode32(n[14..])       & M32L;       // x04:32/--
-            long x05 = (Codec.Decode24(n[18..]) << 4) & M32L;       // x05:28/--
-            long x06 =  Codec.Decode32(n[21..])       & M32L;       // x06:32/--
-            long x07 = (Codec.Decode24(n[25..]) << 4) & M32L;       // x07:28/--
-            long x08 =  Codec.Decode32(n[28..])       & M32L;       // x08:32/--
-            long x09 = (Codec.Decode24(n[32..]) << 4) & M32L;       // x09:28/--
-            long x10 =  Codec.Decode32(n[35..])       & M32L;       // x10:32/--
-            long x11 = (Codec.Decode24(n[39..]) << 4) & M32L;       // x11:28/--
-            long x12 =  Codec.Decode32(n[42..])       & M32L;       // x12:32/--
-            long x13 = (Codec.Decode24(n[46..]) << 4) & M32L;       // x13:28/--
-            long x14 =  Codec.Decode32(n[49..])       & M32L;       // x14:32/--
-            long x15 = (Codec.Decode24(n[53..]) << 4) & M32L;       // x15:28/--
-            long x16 =  Codec.Decode32(n[56..])       & M32L;       // x16:32/--
-            long x17 = (Codec.Decode24(n[60..]) << 4) & M32L;       // x17:28/--
-            long x18 =                 n[63]          & M08L;       // x18:08/--
-            long t;
-
-            //x18 += (x17 >> 28); x17 &= M28L;
-            x09 -= x18 * L0;                            // x09:34/28
-            x10 -= x18 * L1;                            // x10:33/30
-            x11 -= x18 * L2;                            // x11:35/28
-            x12 -= x18 * L3;                            // x12:32/31
-            x13 -= x18 * L4;                            // x13:28/21
-
-            x17 += (x16 >> 28); x16 &= M28L;            // x17:28/--, x16:28/--
-            x08 -= x17 * L0;                            // x08:54/32
-            x09 -= x17 * L1;                            // x09:52/51
-            x10 -= x17 * L2;                            // x10:55/34
-            x11 -= x17 * L3;                            // x11:51/36
-            x12 -= x17 * L4;                            // x12:41/--
-
-            //x16 += (x15 >> 28); x15 &= M28L;
-            x07 -= x16 * L0;                            // x07:54/28
-            x08 -= x16 * L1;                            // x08:54/53
-            x09 -= x16 * L2;                            // x09:55/53
-            x10 -= x16 * L3;                            // x10:55/52
-            x11 -= x16 * L4;                            // x11:51/41
-
-            x15 += (x14 >> 28); x14 &= M28L;            // x15:28/--, x14:28/--
-            x06 -= x15 * L0;                            // x06:54/32
-            x07 -= x15 * L1;                            // x07:54/53
-            x08 -= x15 * L2;                            // x08:56/--
-            x09 -= x15 * L3;                            // x09:55/54
-            x10 -= x15 * L4;                            // x10:55/53
-
-            //x14 += (x13 >> 28); x13 &= M28L;
-            x05 -= x14 * L0;                            // x05:54/28
-            x06 -= x14 * L1;                            // x06:54/53
-            x07 -= x14 * L2;                            // x07:56/--
-            x08 -= x14 * L3;                            // x08:56/51
-            x09 -= x14 * L4;                            // x09:56/--
-
-            x13 += (x12 >> 28); x12 &= M28L;            // x13:28/22, x12:28/--
-            x04 -= x13 * L0;                            // x04:54/49
-            x05 -= x13 * L1;                            // x05:54/53
-            x06 -= x13 * L2;                            // x06:56/--
-            x07 -= x13 * L3;                            // x07:56/52
-            x08 -= x13 * L4;                            // x08:56/52
-
-            x12 += (x11 >> 28); x11 &= M28L;            // x12:28/24, x11:28/--
-            x03 -= x12 * L0;                            // x03:54/49
-            x04 -= x12 * L1;                            // x04:54/51
-            x05 -= x12 * L2;                            // x05:56/--
-            x06 -= x12 * L3;                            // x06:56/52
-            x07 -= x12 * L4;                            // x07:56/53
-
-            x11 += (x10 >> 28); x10 &= M28L;            // x11:29/--, x10:28/--
-            x02 -= x11 * L0;                            // x02:55/32
-            x03 -= x11 * L1;                            // x03:55/--
-            x04 -= x11 * L2;                            // x04:56/55
-            x05 -= x11 * L3;                            // x05:56/52
-            x06 -= x11 * L4;                            // x06:56/53
-
-            x10 += (x09 >> 28); x09 &= M28L;            // x10:29/--, x09:28/--
-            x01 -= x10 * L0;                            // x01:55/28
-            x02 -= x10 * L1;                            // x02:55/54
-            x03 -= x10 * L2;                            // x03:56/55
-            x04 -= x10 * L3;                            // x04:57/--
-            x05 -= x10 * L4;                            // x05:56/53
-
-            x08 += (x07 >> 28); x07 &= M28L;            // x08:56/53, x07:28/--
-            x09 += (x08 >> 28); x08 &= M28L;            // x09:29/25, x08:28/--
-
-            t    = (x08 >> 27) & 1L;
-            x09 += t;                                   // x09:29/26
-
-            x00 -= x09 * L0;                            // x00:55/53
-            x01 -= x09 * L1;                            // x01:55/54
-            x02 -= x09 * L2;                            // x02:57/--
-            x03 -= x09 * L3;                            // x03:57/--
-            x04 -= x09 * L4;                            // x04:57/42
-
-            x01 += (x00 >> 28); x00 &= M28L;
-            x02 += (x01 >> 28); x01 &= M28L;
-            x03 += (x02 >> 28); x02 &= M28L;
-            x04 += (x03 >> 28); x03 &= M28L;
-            x05 += (x04 >> 28); x04 &= M28L;
-            x06 += (x05 >> 28); x05 &= M28L;
-            x07 += (x06 >> 28); x06 &= M28L;
-            x08 += (x07 >> 28); x07 &= M28L;
-            x09  = (x08 >> 28); x08 &= M28L;
-
-            x09 -= t;
-
-            Debug.Assert(x09 == 0L || x09 == -1L);
-
-            x00 += x09 & L0;
-            x01 += x09 & L1;
-            x02 += x09 & L2;
-            x03 += x09 & L3;
-            x04 += x09 & L4;
-
-            x01 += (x00 >> 28); x00 &= M28L;
-            x02 += (x01 >> 28); x01 &= M28L;
-            x03 += (x02 >> 28); x02 &= M28L;
-            x04 += (x03 >> 28); x03 &= M28L;
-            x05 += (x04 >> 28); x04 &= M28L;
-            x06 += (x05 >> 28); x05 &= M28L;
-            x07 += (x06 >> 28); x06 &= M28L;
-            x08 += (x07 >> 28); x07 &= M28L;
-
-            Codec.Encode56((ulong)(x00 | (x01 << 28)), r);
-            Codec.Encode56((ulong)(x02 | (x03 << 28)), r[7..]);
-            Codec.Encode56((ulong)(x04 | (x05 << 28)), r[14..]);
-            Codec.Encode56((ulong)(x06 | (x07 << 28)), r[21..]);
-            Codec.Encode32((uint)x08, r[28..]);
-        }
-#endif
-
         private static void ScalarMult(byte[] k, ref PointAffine p, ref PointAccum r)
+#endif
         {
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-            ScalarMult(k.AsSpan(), ref p, ref r);
+            Span<uint> n = stackalloc uint[ScalarUints];
 #else
             uint[] n = new uint[ScalarUints];
-            DecodeScalar(k, 0, n);
-
-            // Recode the scalar into signed-digit form
-            {
-                uint c1 = Nat.CAdd(ScalarUints, ~(int)n[0] & 1, n, L, n);   Debug.Assert(c1 == 0U);
-                uint c2 = Nat.ShiftDownBit(ScalarUints, n, 1U);             Debug.Assert(c2 == (1U << 31));
-            }
-
-            Init(out PointPrecompZ q);
-            Init(out PointTemp t);
-            int[] table = PointPrecomputeZ(ref p, 8, ref t);
-
-            PointSetNeutral(ref r);
-
-            int w = 63;
-            for (;;)
-            {
-                PointLookupZ(n, w, table, ref q);
-                PointAdd(ref q, ref r, ref t);
-
-                if (--w < 0)
-                    break;
-
-                for (int i = 0; i < 4; ++i)
-                {
-                    PointDouble(ref r);
-                }
-            }
 #endif
-        }
 
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        private static void ScalarMult(ReadOnlySpan<byte> k, ref PointAffine p, ref PointAccum r)
-        {
-            Span<uint> n = stackalloc uint[ScalarUints];
-            DecodeScalar(k, n);
-
-            // Recode the scalar into signed-digit form
-            {
-                uint c1 = Nat.CAdd(ScalarUints, ~(int)n[0] & 1, n, L, n);   Debug.Assert(c1 == 0U);
-                uint c2 = Nat.ShiftDownBit(ScalarUints, n, 1U);             Debug.Assert(c2 == (1U << 31));
-            }
+            Scalar25519.Decode(k, n);
+            Scalar25519.ToSignedDigits(n, n);
 
             Init(out PointPrecompZ q);
             Init(out PointTemp t);
@@ -1597,79 +1325,12 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
                 }
             }
         }
-#endif
 
-        private static void ScalarMultBase(byte[] k, ref PointAccum r)
-        {
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-            ScalarMultBase(k.AsSpan(), ref r);
+            private static void ScalarMultBase(ReadOnlySpan<byte> k, ref PointAccum r)
 #else
-            // Equivalent (but much slower)
-            //Init(out PointAffine p);
-            //F.Copy(B_x, 0, p.x, 0);
-            //F.Copy(B_y, 0, p.y, 0);
-            //ScalarMult(k, ref p, ref r);
-
-            Precompute();
-
-            uint[] n = new uint[ScalarUints];
-            DecodeScalar(k, 0, n);
-
-            // Recode the scalar into signed-digit form, then group comb bits in each block
-            {
-                uint c1 = Nat.CAdd(ScalarUints, ~(int)n[0] & 1, n, L, n);   Debug.Assert(c1 == 0U);
-                uint c2 = Nat.ShiftDownBit(ScalarUints, n, 1U);             Debug.Assert(c2 == (1U << 31));
-
-                /*
-                 * Because we are using 4 teeth and 8 spacing, each limb of n corresponds to one of the 8 blocks.
-                 * Therefore we can efficiently group the bits for each comb position using a (double) shuffle. 
-                 */
-                for (int i = 0; i < ScalarUints; ++i)
-                {
-                    n[i] = Interleave.Shuffle2(n[i]);
-                }
-            }
-
-            Init(out PointPrecomp p);
-            Init(out PointTemp t);
-
-            PointSetNeutral(ref r);
-            int resultSign = 0;
-
-            int cOff = (PrecompSpacing - 1) * PrecompTeeth;
-            for (;;)
-            {
-                for (int b = 0; b < PrecompBlocks; ++b)
-                {
-                    uint w = n[b] >> cOff;
-                    int sign = (int)(w >> (PrecompTeeth - 1)) & 1;
-                    int abs = ((int)w ^ -sign) & PrecompMask;
-
-                    Debug.Assert(sign == 0 || sign == 1);
-                    Debug.Assert(0 <= abs && abs < PrecompPoints);
-
-                    PointLookup(b, abs, ref p);
-
-                    F.CNegate(resultSign ^ sign, r.x);
-                    F.CNegate(resultSign ^ sign, r.u);
-                    resultSign = sign;
-
-                    PointAdd(ref p, ref r, ref t);
-                }
-
-                if ((cOff -= PrecompTeeth) < 0)
-                    break;
-
-                PointDouble(ref r);
-            }
-
-            F.CNegate(resultSign, r.x);
-            F.CNegate(resultSign, r.u);
+        private static void ScalarMultBase(byte[] k, ref PointAccum r)
 #endif
-        }
-
-#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        private static void ScalarMultBase(ReadOnlySpan<byte> k, ref PointAccum r)
         {
             // Equivalent (but much slower)
             //Init(out PointAffine p);
@@ -1679,23 +1340,15 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 
             Precompute();
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
             Span<uint> n = stackalloc uint[ScalarUints];
-            DecodeScalar(k, n);
+#else
+            uint[] n = new uint[ScalarUints];
+#endif
 
-            // Recode the scalar into signed-digit form, then group comb bits in each block
-            {
-                uint c1 = Nat.CAdd(ScalarUints, ~(int)n[0] & 1, n, L, n);   Debug.Assert(c1 == 0U);
-                uint c2 = Nat.ShiftDownBit(ScalarUints, n, 1U);             Debug.Assert(c2 == (1U << 31));
-
-                /*
-                 * Because we are using 4 teeth and 8 spacing, each limb of n corresponds to one of the 8 blocks.
-                 * Therefore we can efficiently group the bits for each comb position using a (double) shuffle. 
-                 */
-                for (int i = 0; i < ScalarUints; ++i)
-                {
-                    n[i] = Interleave.Shuffle2(n[i]);
-                }
-            }
+            Scalar25519.Decode(k, n);
+            Scalar25519.ToSignedDigits(n, n);
+            GroupCombBits(n);
 
             Init(out PointPrecomp p);
             Init(out PointTemp t);
@@ -1733,7 +1386,6 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
             F.CNegate(resultSign, r.x);
             F.CNegate(resultSign, r.u);
         }
-#endif
 
         private static void ScalarMultBaseEncoded(byte[] k, byte[] r, int rOff)
         {
@@ -1800,7 +1452,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 #else
             sbyte[] ws_p = new sbyte[253];
 #endif
-            Wnaf.GetSignedVar(L, WnafWidth, ws_p);
+            Scalar25519.GetOrderWnafVar(WnafWidth, ws_p);
 
             int count = 1 << (WnafWidth - 2);
             PointPrecompZ[] tp = new PointPrecompZ[count];
@@ -1826,39 +1478,43 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
         }
 
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-        private static void ScalarMultStrausVar(ReadOnlySpan<uint> nb, ReadOnlySpan<uint> np, ref PointAffine p,
-            ref PointAccum r)
+        private static void ScalarMultStraus128Var(ReadOnlySpan<uint> nb, ReadOnlySpan<uint> np, ref PointAffine p,
+            ReadOnlySpan<uint> nq, ref PointAffine q, ref PointAccum r)
 #else
-        private static void ScalarMultStrausVar(uint[] nb, uint[] np, ref PointAffine p, ref PointAccum r)
+        private static void ScalarMultStraus128Var(uint[] nb, uint[] np, ref PointAffine p, uint[] nq,
+            ref PointAffine q, ref PointAccum r)
 #endif
         {
             Debug.Assert(nb.Length == ScalarUints);
-            Debug.Assert(nb[ScalarUints - 1] <= L[ScalarUints - 1]);
-
-            Debug.Assert(np.Length == ScalarUints);
-            Debug.Assert(np[ScalarUints - 1] <= L[ScalarUints - 1]);
+            Debug.Assert(np.Length == 4);
+            Debug.Assert(nq.Length == 4);
 
             Precompute();
 
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
-            Span<sbyte> ws_b = stackalloc sbyte[253];
-            Span<sbyte> ws_p = stackalloc sbyte[253];
+            Span<sbyte> ws_b = stackalloc sbyte[256];
+            Span<sbyte> ws_p = stackalloc sbyte[128];
+            Span<sbyte> ws_q = stackalloc sbyte[128];
 #else
-            sbyte[] ws_b = new sbyte[253];
-            sbyte[] ws_p = new sbyte[253];
+            sbyte[] ws_b = new sbyte[256];
+            sbyte[] ws_p = new sbyte[128];
+            sbyte[] ws_q = new sbyte[128];
 #endif
 
             Wnaf.GetSignedVar(nb, WnafWidthBase, ws_b);
-            Wnaf.GetSignedVar(np, WnafWidth, ws_p);
+            Wnaf.GetSignedVar(np, WnafWidth - 1, ws_p);
+            Wnaf.GetSignedVar(nq, WnafWidth - 1, ws_q);
 
-            int count = 1 << (WnafWidth - 2);
+            int count = 1 << (WnafWidth - 3);
             PointPrecompZ[] tp = new PointPrecompZ[count];
+            PointPrecompZ[] tq = new PointPrecompZ[count];
             Init(out PointTemp t);
             PointPrecomputeZ(ref p, tp, count, ref t);
+            PointPrecomputeZ(ref q, tq, count, ref t);
 
             PointSetNeutral(ref r);
 
-            for (int bit = 252;;)
+            for (int bit = 127; bit >= 0; --bit)
             {
                 int wb = ws_b[bit];
                 if (wb != 0)
@@ -1867,6 +1523,13 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
                     PointAddVar(wb < 0, ref PrecompBaseWnaf[index], ref r, ref t);
                 }
 
+                int wb128 = ws_b[128 + bit];
+                if (wb128 != 0)
+                {
+                    int index = (wb128 >> 1) ^ (wb128 >> 31);
+                    PointAddVar(wb128 < 0, ref PrecompBase128Wnaf[index], ref r, ref t);
+                }
+
                 int wp = ws_p[bit];
                 if (wp != 0)
                 {
@@ -1874,11 +1537,19 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
                     PointAddVar(wp < 0, ref tp[index], ref r, ref t);
                 }
 
-                if (--bit < 0)
-                    break;
+                int wq = ws_q[bit];
+                if (wq != 0)
+                {
+                    int index = (wq >> 1) ^ (wq >> 31);
+                    PointAddVar(wq < 0, ref tq[index], ref r, ref t);
+                }
 
                 PointDouble(ref r);
             }
+
+            // NOTE: Together with the final PointDouble of the loop, this clears the cofactor of 8
+            PointDouble(ref r);
+            PointDouble(ref r);
         }
 
         public static void Sign(byte[] sk, int skOff, byte[] m, int mOff, int mLen, byte[] sig, int sigOff)
@@ -1949,24 +1620,44 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
 
         public static bool ValidatePublicKeyFull(byte[] pk, int pkOff)
         {
-            Init(out PointAffine p);
-            if (!DecodePointVar(pk, pkOff, false, ref p))
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+            Span<byte> A = stackalloc byte[PublicKeySize];
+            A.CopyFrom(pk.AsSpan(pkOff));
+#else
+            byte[] A = Copy(pk, pkOff, PublicKeySize);
+#endif
+
+            if (!CheckPointFullVar(A))
                 return false;
 
-            Init(out PointAccum r);
-            ScalarMultOrderVar(ref p, ref r);
+            Init(out PointAffine pA);
+            if (!DecodePointVar(A, false, ref pA))
+                return false;
 
-            F.Normalize(r.x);
-            F.Normalize(r.y);
-            F.Normalize(r.z);
+            Init(out PointAccum pR);
+            ScalarMultOrderVar(ref pA, ref pR);
+
+            F.Normalize(pR.x);
+            F.Normalize(pR.y);
+            F.Normalize(pR.z);
 
-            return IsNeutralElementVar(r.x, r.y, r.z);
+            return IsNeutralElementVar(pR.x, pR.y, pR.z);
         }
 
         public static bool ValidatePublicKeyPartial(byte[] pk, int pkOff)
         {
-            Init(out PointAffine p);
-            return DecodePointVar(pk, pkOff, false, ref p);
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+            Span<byte> A = stackalloc byte[PublicKeySize];
+            A.CopyFrom(pk.AsSpan(pkOff));
+#else
+            byte[] A = Copy(pk, pkOff, PublicKeySize);
+#endif
+
+            if (!CheckPointFullVar(A))
+                return false;
+
+            Init(out PointAffine pA);
+            return DecodePointVar(A, false, ref pA);
         }
 
         public static bool Verify(byte[] sig, int sigOff, byte[] pk, int pkOff, byte[] m, int mOff, int mLen)
diff --git a/crypto/src/math/ec/rfc8032/Scalar25519.cs b/crypto/src/math/ec/rfc8032/Scalar25519.cs
new file mode 100644
index 000000000..738ce63cb
--- /dev/null
+++ b/crypto/src/math/ec/rfc8032/Scalar25519.cs
@@ -0,0 +1,782 @@
+using System;
+using System.Diagnostics;
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+#endif
+
+using Org.BouncyCastle.Crypto.Utilities;
+using Org.BouncyCastle.Math.Raw;
+using Org.BouncyCastle.Utilities;
+
+namespace Org.BouncyCastle.Math.EC.Rfc8032
+{
+    internal static class Scalar25519
+    {
+        internal const int Size = 8;
+
+        private const long M08L = 0x000000FFL;
+        private const long M28L = 0x0FFFFFFFL;
+        private const long M32L = 0xFFFFFFFFL;
+
+        private const int TargetLength = 254;
+
+        private static readonly uint[] L = { 0x5CF5D3EDU, 0x5812631AU, 0xA2F79CD6U, 0x14DEF9DEU, 0x00000000U,
+            0x00000000U, 0x00000000U, 0x10000000U };
+        private static readonly uint[] LSq = { 0xAB128969U, 0xE2EDF685U, 0x2298A31DU, 0x68039276U, 0xD217F5BEU,
+            0x3DCEEC73U, 0x1B7C309AU, 0xA1B39941U, 0x4B9EBA7DU, 0xCB024C63U, 0xD45EF39AU, 0x029BDF3BU, 0x00000000U,
+            0x00000000U, 0x00000000U, 0x01000000U };
+
+        private const int L0 = -0x030A2C13;     // L0:26/--
+        private const int L1 =  0x012631A6;     // L1:24/22
+        private const int L2 =  0x079CD658;     // L2:27/--
+        private const int L3 = -0x006215D1;     // L3:23/--
+        private const int L4 =  0x000014DF;     // L4:12/11
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        internal static bool CheckVar(ReadOnlySpan<byte> s, Span<uint> n)
+        {
+            Decode(s, n);
+            return !Nat.Gte(Size, n, L);
+        }
+#else
+        internal static bool CheckVar(byte[] s, uint[] n)
+        {
+            Decode(s, n);
+            return !Nat256.Gte(n, L);
+        }
+#endif
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        internal static void Decode(ReadOnlySpan<byte> k, Span<uint> n)
+        {
+            Codec.Decode32(k, n[..Size]);
+        }
+#else
+        internal static void Decode(byte[] k, uint[] n)
+        {
+            Codec.Decode32(k, 0, n, 0, Size);
+        }
+#endif
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        internal static void GetOrderWnafVar(int width, Span<sbyte> ws)
+#else
+        internal static void GetOrderWnafVar(int width, sbyte[] ws)
+#endif
+        {
+            Wnaf.GetSignedVar(L, width, ws);
+        }
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        internal static void Multiply128Var(ReadOnlySpan<uint> x, ReadOnlySpan<uint> y128, Span<uint> z)
+        {
+            Span<uint> tt = stackalloc uint[16];
+            Nat.Mul(y128, x, tt);
+
+            if ((y128[3] >> 31) != 0)
+            {
+                Nat.AddTo(8, L, tt[4..]);
+                Nat.SubFrom(8, x, tt[4..]);
+            }
+
+            Span<byte> r = MemoryMarshal.AsBytes(tt);
+            Reduce(r, r);
+            tt[..8].CopyTo(z);
+        }
+#else
+        internal static void Multiply128Var(uint[] x, uint[] y128, uint[] z)
+        {
+            uint[] tt = new uint[12];
+            Nat.Mul(y128, 0, 4, x, 0, 8, tt, 0);
+
+            if ((y128[3] >> 31) != 0)
+            {
+                Nat256.AddTo(L, 0, tt, 4, 0U);
+                Nat256.SubFrom(x, 0, tt, 4);
+            }
+
+            byte[] bytes = new byte[64];
+            Codec.Encode32(tt, 0, 12, bytes, 0);
+
+            byte[] r = Reduce(bytes);
+            Codec.Decode32(r, 0, z, 0, 8);
+        }
+#endif
+
+        internal static byte[] Reduce(byte[] n)
+        {
+            byte[] r = new byte[64];
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+            Reduce(n, r);
+#else
+            long x00 =  Codec.Decode32(n,  0)       & M32L;         // x00:32/--
+            long x01 = (Codec.Decode24(n,  4) << 4) & M32L;         // x01:28/--
+            long x02 =  Codec.Decode32(n,  7)       & M32L;         // x02:32/--
+            long x03 = (Codec.Decode24(n, 11) << 4) & M32L;         // x03:28/--
+            long x04 =  Codec.Decode32(n, 14)       & M32L;         // x04:32/--
+            long x05 = (Codec.Decode24(n, 18) << 4) & M32L;         // x05:28/--
+            long x06 =  Codec.Decode32(n, 21)       & M32L;         // x06:32/--
+            long x07 = (Codec.Decode24(n, 25) << 4) & M32L;         // x07:28/--
+            long x08 =  Codec.Decode32(n, 28)       & M32L;         // x08:32/--
+            long x09 = (Codec.Decode24(n, 32) << 4) & M32L;         // x09:28/--
+            long x10 =  Codec.Decode32(n, 35)       & M32L;         // x10:32/--
+            long x11 = (Codec.Decode24(n, 39) << 4) & M32L;         // x11:28/--
+            long x12 =  Codec.Decode32(n, 42)       & M32L;         // x12:32/--
+            long x13 = (Codec.Decode24(n, 46) << 4) & M32L;         // x13:28/--
+            long x14 =  Codec.Decode32(n, 49)       & M32L;         // x14:32/--
+            long x15 = (Codec.Decode24(n, 53) << 4) & M32L;         // x15:28/--
+            long x16 =  Codec.Decode32(n, 56)       & M32L;         // x16:32/--
+            long x17 = (Codec.Decode24(n, 60) << 4) & M32L;         // x17:28/--
+            long x18 =                 n[63]        & M08L;         // x18:08/--
+            long t;
+
+            //x18 += (x17 >> 28); x17 &= M28L;
+            x09 -= x18 * L0;                            // x09:34/28
+            x10 -= x18 * L1;                            // x10:33/30
+            x11 -= x18 * L2;                            // x11:35/28
+            x12 -= x18 * L3;                            // x12:32/31
+            x13 -= x18 * L4;                            // x13:28/21
+
+            x17 += (x16 >> 28); x16 &= M28L;            // x17:28/--, x16:28/--
+            x08 -= x17 * L0;                            // x08:54/32
+            x09 -= x17 * L1;                            // x09:52/51
+            x10 -= x17 * L2;                            // x10:55/34
+            x11 -= x17 * L3;                            // x11:51/36
+            x12 -= x17 * L4;                            // x12:41/--
+
+            //x16 += (x15 >> 28); x15 &= M28L;
+            x07 -= x16 * L0;                            // x07:54/28
+            x08 -= x16 * L1;                            // x08:54/53
+            x09 -= x16 * L2;                            // x09:55/53
+            x10 -= x16 * L3;                            // x10:55/52
+            x11 -= x16 * L4;                            // x11:51/41
+
+            x15 += (x14 >> 28); x14 &= M28L;            // x15:28/--, x14:28/--
+            x06 -= x15 * L0;                            // x06:54/32
+            x07 -= x15 * L1;                            // x07:54/53
+            x08 -= x15 * L2;                            // x08:56/--
+            x09 -= x15 * L3;                            // x09:55/54
+            x10 -= x15 * L4;                            // x10:55/53
+
+            //x14 += (x13 >> 28); x13 &= M28L;
+            x05 -= x14 * L0;                            // x05:54/28
+            x06 -= x14 * L1;                            // x06:54/53
+            x07 -= x14 * L2;                            // x07:56/--
+            x08 -= x14 * L3;                            // x08:56/51
+            x09 -= x14 * L4;                            // x09:56/--
+
+            x13 += (x12 >> 28); x12 &= M28L;            // x13:28/22, x12:28/--
+            x04 -= x13 * L0;                            // x04:54/49
+            x05 -= x13 * L1;                            // x05:54/53
+            x06 -= x13 * L2;                            // x06:56/--
+            x07 -= x13 * L3;                            // x07:56/52
+            x08 -= x13 * L4;                            // x08:56/52
+
+            x12 += (x11 >> 28); x11 &= M28L;            // x12:28/24, x11:28/--
+            x03 -= x12 * L0;                            // x03:54/49
+            x04 -= x12 * L1;                            // x04:54/51
+            x05 -= x12 * L2;                            // x05:56/--
+            x06 -= x12 * L3;                            // x06:56/52
+            x07 -= x12 * L4;                            // x07:56/53
+
+            x11 += (x10 >> 28); x10 &= M28L;            // x11:29/--, x10:28/--
+            x02 -= x11 * L0;                            // x02:55/32
+            x03 -= x11 * L1;                            // x03:55/--
+            x04 -= x11 * L2;                            // x04:56/55
+            x05 -= x11 * L3;                            // x05:56/52
+            x06 -= x11 * L4;                            // x06:56/53
+
+            x10 += (x09 >> 28); x09 &= M28L;            // x10:29/--, x09:28/--
+            x01 -= x10 * L0;                            // x01:55/28
+            x02 -= x10 * L1;                            // x02:55/54
+            x03 -= x10 * L2;                            // x03:56/55
+            x04 -= x10 * L3;                            // x04:57/--
+            x05 -= x10 * L4;                            // x05:56/53
+
+            x08 += (x07 >> 28); x07 &= M28L;            // x08:56/53, x07:28/--
+            x09 += (x08 >> 28); x08 &= M28L;            // x09:29/25, x08:28/--
+
+            t    = (x08 >> 27) & 1L;
+            x09 += t;                                   // x09:29/26
+
+            x00 -= x09 * L0;                            // x00:55/53
+            x01 -= x09 * L1;                            // x01:55/54
+            x02 -= x09 * L2;                            // x02:57/--
+            x03 -= x09 * L3;                            // x03:57/--
+            x04 -= x09 * L4;                            // x04:57/42
+
+            x01 += (x00 >> 28); x00 &= M28L;
+            x02 += (x01 >> 28); x01 &= M28L;
+            x03 += (x02 >> 28); x02 &= M28L;
+            x04 += (x03 >> 28); x03 &= M28L;
+            x05 += (x04 >> 28); x04 &= M28L;
+            x06 += (x05 >> 28); x05 &= M28L;
+            x07 += (x06 >> 28); x06 &= M28L;
+            x08 += (x07 >> 28); x07 &= M28L;
+            x09  = (x08 >> 28); x08 &= M28L;
+
+            x09 -= t;
+
+            Debug.Assert(x09 == 0L || x09 == -1L);
+
+            x00 += x09 & L0;
+            x01 += x09 & L1;
+            x02 += x09 & L2;
+            x03 += x09 & L3;
+            x04 += x09 & L4;
+
+            x01 += (x00 >> 28); x00 &= M28L;
+            x02 += (x01 >> 28); x01 &= M28L;
+            x03 += (x02 >> 28); x02 &= M28L;
+            x04 += (x03 >> 28); x03 &= M28L;
+            x05 += (x04 >> 28); x04 &= M28L;
+            x06 += (x05 >> 28); x05 &= M28L;
+            x07 += (x06 >> 28); x06 &= M28L;
+            x08 += (x07 >> 28); x07 &= M28L;
+
+            Codec.Encode56((ulong)(x00 | (x01 << 28)), r, 0);
+            Codec.Encode56((ulong)(x02 | (x03 << 28)), r, 7);
+            Codec.Encode56((ulong)(x04 | (x05 << 28)), r, 14);
+            Codec.Encode56((ulong)(x06 | (x07 << 28)), r, 21);
+            Codec.Encode32((uint)x08, r, 28);
+#endif
+
+            return r;
+        }
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        internal static void Reduce(ReadOnlySpan<byte> n, Span<byte> r)
+        {
+            long x00 =  Codec.Decode32(n[ 0..])       & M32L;       // x00:32/--
+            long x01 = (Codec.Decode24(n[ 4..]) << 4) & M32L;       // x01:28/--
+            long x02 =  Codec.Decode32(n[ 7..])       & M32L;       // x02:32/--
+            long x03 = (Codec.Decode24(n[11..]) << 4) & M32L;       // x03:28/--
+            long x04 =  Codec.Decode32(n[14..])       & M32L;       // x04:32/--
+            long x05 = (Codec.Decode24(n[18..]) << 4) & M32L;       // x05:28/--
+            long x06 =  Codec.Decode32(n[21..])       & M32L;       // x06:32/--
+            long x07 = (Codec.Decode24(n[25..]) << 4) & M32L;       // x07:28/--
+            long x08 =  Codec.Decode32(n[28..])       & M32L;       // x08:32/--
+            long x09 = (Codec.Decode24(n[32..]) << 4) & M32L;       // x09:28/--
+            long x10 =  Codec.Decode32(n[35..])       & M32L;       // x10:32/--
+            long x11 = (Codec.Decode24(n[39..]) << 4) & M32L;       // x11:28/--
+            long x12 =  Codec.Decode32(n[42..])       & M32L;       // x12:32/--
+            long x13 = (Codec.Decode24(n[46..]) << 4) & M32L;       // x13:28/--
+            long x14 =  Codec.Decode32(n[49..])       & M32L;       // x14:32/--
+            long x15 = (Codec.Decode24(n[53..]) << 4) & M32L;       // x15:28/--
+            long x16 =  Codec.Decode32(n[56..])       & M32L;       // x16:32/--
+            long x17 = (Codec.Decode24(n[60..]) << 4) & M32L;       // x17:28/--
+            long x18 =                 n[63]          & M08L;       // x18:08/--
+            long t;
+
+            //x18 += (x17 >> 28); x17 &= M28L;
+            x09 -= x18 * L0;                            // x09:34/28
+            x10 -= x18 * L1;                            // x10:33/30
+            x11 -= x18 * L2;                            // x11:35/28
+            x12 -= x18 * L3;                            // x12:32/31
+            x13 -= x18 * L4;                            // x13:28/21
+
+            x17 += (x16 >> 28); x16 &= M28L;            // x17:28/--, x16:28/--
+            x08 -= x17 * L0;                            // x08:54/32
+            x09 -= x17 * L1;                            // x09:52/51
+            x10 -= x17 * L2;                            // x10:55/34
+            x11 -= x17 * L3;                            // x11:51/36
+            x12 -= x17 * L4;                            // x12:41/--
+
+            //x16 += (x15 >> 28); x15 &= M28L;
+            x07 -= x16 * L0;                            // x07:54/28
+            x08 -= x16 * L1;                            // x08:54/53
+            x09 -= x16 * L2;                            // x09:55/53
+            x10 -= x16 * L3;                            // x10:55/52
+            x11 -= x16 * L4;                            // x11:51/41
+
+            x15 += (x14 >> 28); x14 &= M28L;            // x15:28/--, x14:28/--
+            x06 -= x15 * L0;                            // x06:54/32
+            x07 -= x15 * L1;                            // x07:54/53
+            x08 -= x15 * L2;                            // x08:56/--
+            x09 -= x15 * L3;                            // x09:55/54
+            x10 -= x15 * L4;                            // x10:55/53
+
+            //x14 += (x13 >> 28); x13 &= M28L;
+            x05 -= x14 * L0;                            // x05:54/28
+            x06 -= x14 * L1;                            // x06:54/53
+            x07 -= x14 * L2;                            // x07:56/--
+            x08 -= x14 * L3;                            // x08:56/51
+            x09 -= x14 * L4;                            // x09:56/--
+
+            x13 += (x12 >> 28); x12 &= M28L;            // x13:28/22, x12:28/--
+            x04 -= x13 * L0;                            // x04:54/49
+            x05 -= x13 * L1;                            // x05:54/53
+            x06 -= x13 * L2;                            // x06:56/--
+            x07 -= x13 * L3;                            // x07:56/52
+            x08 -= x13 * L4;                            // x08:56/52
+
+            x12 += (x11 >> 28); x11 &= M28L;            // x12:28/24, x11:28/--
+            x03 -= x12 * L0;                            // x03:54/49
+            x04 -= x12 * L1;                            // x04:54/51
+            x05 -= x12 * L2;                            // x05:56/--
+            x06 -= x12 * L3;                            // x06:56/52
+            x07 -= x12 * L4;                            // x07:56/53
+
+            x11 += (x10 >> 28); x10 &= M28L;            // x11:29/--, x10:28/--
+            x02 -= x11 * L0;                            // x02:55/32
+            x03 -= x11 * L1;                            // x03:55/--
+            x04 -= x11 * L2;                            // x04:56/55
+            x05 -= x11 * L3;                            // x05:56/52
+            x06 -= x11 * L4;                            // x06:56/53
+
+            x10 += (x09 >> 28); x09 &= M28L;            // x10:29/--, x09:28/--
+            x01 -= x10 * L0;                            // x01:55/28
+            x02 -= x10 * L1;                            // x02:55/54
+            x03 -= x10 * L2;                            // x03:56/55
+            x04 -= x10 * L3;                            // x04:57/--
+            x05 -= x10 * L4;                            // x05:56/53
+
+            x08 += (x07 >> 28); x07 &= M28L;            // x08:56/53, x07:28/--
+            x09 += (x08 >> 28); x08 &= M28L;            // x09:29/25, x08:28/--
+
+            t    = (x08 >> 27) & 1L;
+            x09 += t;                                   // x09:29/26
+
+            x00 -= x09 * L0;                            // x00:55/53
+            x01 -= x09 * L1;                            // x01:55/54
+            x02 -= x09 * L2;                            // x02:57/--
+            x03 -= x09 * L3;                            // x03:57/--
+            x04 -= x09 * L4;                            // x04:57/42
+
+            x01 += (x00 >> 28); x00 &= M28L;
+            x02 += (x01 >> 28); x01 &= M28L;
+            x03 += (x02 >> 28); x02 &= M28L;
+            x04 += (x03 >> 28); x03 &= M28L;
+            x05 += (x04 >> 28); x04 &= M28L;
+            x06 += (x05 >> 28); x05 &= M28L;
+            x07 += (x06 >> 28); x06 &= M28L;
+            x08 += (x07 >> 28); x07 &= M28L;
+            x09  = (x08 >> 28); x08 &= M28L;
+
+            x09 -= t;
+
+            Debug.Assert(x09 == 0L || x09 == -1L);
+
+            x00 += x09 & L0;
+            x01 += x09 & L1;
+            x02 += x09 & L2;
+            x03 += x09 & L3;
+            x04 += x09 & L4;
+
+            x01 += (x00 >> 28); x00 &= M28L;
+            x02 += (x01 >> 28); x01 &= M28L;
+            x03 += (x02 >> 28); x02 &= M28L;
+            x04 += (x03 >> 28); x03 &= M28L;
+            x05 += (x04 >> 28); x04 &= M28L;
+            x06 += (x05 >> 28); x05 &= M28L;
+            x07 += (x06 >> 28); x06 &= M28L;
+            x08 += (x07 >> 28); x07 &= M28L;
+
+            Codec.Encode56((ulong)(x00 | (x01 << 28)), r);
+            Codec.Encode56((ulong)(x02 | (x03 << 28)), r[7..]);
+            Codec.Encode56((ulong)(x04 | (x05 << 28)), r[14..]);
+            Codec.Encode56((ulong)(x06 | (x07 << 28)), r[21..]);
+            Codec.Encode32((uint)x08, r[28..]);
+        }
+#endif
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        internal static void ReduceBasisVar(ReadOnlySpan<uint> k, Span<uint> z0, Span<uint> z1)
+        {
+            /*
+             * Split scalar k into two half-size scalars z0 and z1, such that z1 * k == z0 mod L.
+             * 
+             * See https://ia.cr/2020/454 (Pornin).
+             */
+
+            Span<uint> Nu = stackalloc uint[16];    LSq.CopyTo(Nu);
+            Span<uint> Nv = stackalloc uint[16];    Nat.Square(8, k, Nv); Nat.AddWordTo(16, 1U, Nv);
+            Span<uint> p  = stackalloc uint[16];    Nat.Mul(8, L, k, p);
+            Span<uint> u0 = stackalloc uint[4];     u0.CopyFrom(L);
+            Span<uint> u1 = stackalloc uint[4];
+            Span<uint> v0 = stackalloc uint[4];     v0.CopyFrom(k);
+            Span<uint> v1 = stackalloc uint[4];     v1[0] = 1U;
+
+            int last = 15;
+            int len_Nv = GetBitLengthPositive(last, Nv);
+
+            while (len_Nv > TargetLength)
+            {
+                int len_p = GetBitLength(last, p);
+                int s = len_p - len_Nv;
+                s &= ~(s >> 31);
+
+                if ((int)p[last] < 0)
+                {
+                    AddShifted_NP(last, s, Nu, Nv, p);
+                    AddShifted_UV(3, s, u0, u1, v0, v1);
+                }
+                else
+                {
+                    SubShifted_NP(last, s, Nu, Nv, p);
+                    SubShifted_UV(3, s, u0, u1, v0, v1);
+                }
+
+                if (LessThan(last, Nu, Nv))
+                {
+                    Swap(ref u0, ref v0);
+                    Swap(ref u1, ref v1);
+                    Swap(ref Nu, ref Nv);
+
+                    last = len_Nv >> 5;
+                    len_Nv = GetBitLengthPositive(last, Nv);
+                }
+            }
+
+            // v1 * k == v0 mod L
+            v0.CopyTo(z0);
+            v1.CopyTo(z1);
+        }
+#else
+        internal static void ReduceBasisVar(uint[] k, uint[] z0, uint[] z1)
+        {
+            /*
+             * Split scalar k into two half-size scalars z0 and z1, such that z1 * k == z0 mod L.
+             * 
+             * See https://ia.cr/2020/454 (Pornin).
+             */
+
+            uint[] Nu = new uint[16];       Array.Copy(LSq, Nu, 16);
+            uint[] Nv = new uint[16];       Nat.Square(8, k, Nv); Nat.AddWordTo(16, 1U, Nv);
+            uint[] p  = new uint[16];       Nat.Mul(8, L, k, p);
+            uint[] u0 = new uint[4];        Array.Copy(L, u0, 4);
+            uint[] u1 = new uint[4];
+            uint[] v0 = new uint[4];        Array.Copy(k, v0, 4);
+            uint[] v1 = new uint[4];        v1[0] = 1U;
+
+            int last = 15;
+            int len_Nv = GetBitLengthPositive(last, Nv);
+
+            while (len_Nv > TargetLength)
+            {
+                int len_p = GetBitLength(last, p);
+                int s = len_p - len_Nv;
+                s &= ~(s >> 31);
+
+                if ((int)p[last] < 0)
+                {
+                    AddShifted_NP(last, s, Nu, Nv, p);
+                    AddShifted_UV(3, s, u0, u1, v0, v1);
+                }
+                else
+                {
+                    SubShifted_NP(last, s, Nu, Nv, p);
+                    SubShifted_UV(3, s, u0, u1, v0, v1);
+                }
+
+                if (LessThan(last, Nu, Nv))
+                {
+                    Swap(ref u0, ref v0);
+                    Swap(ref u1, ref v1);
+                    Swap(ref Nu, ref Nv);
+
+                    last = len_Nv >> 5;
+                    len_Nv = GetBitLengthPositive(last, Nv);
+                }
+            }
+
+            // v1 * k == v0 mod L
+            Array.Copy(v0, z0, 4);
+            Array.Copy(v1, z1, 4);
+        }
+#endif
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        internal static void ToSignedDigits(ReadOnlySpan<uint> x, Span<uint> z)
+#else
+        internal static void ToSignedDigits(uint[] x, uint[] z)
+#endif
+        {
+            uint c1 = Nat.CAdd(Size, ~(int)x[0] & 1, x, L, z);  Debug.Assert(c1 == 0U);
+            uint c2 = Nat.ShiftDownBit(Size, z, 1U);            Debug.Assert(c2 == (1U << 31));
+        }
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void AddShifted_NP(int last, int s, Span<uint> Nu, ReadOnlySpan<uint> Nv, Span<uint> _p)
+#else
+        private static void AddShifted_NP(int last, int s, uint[] Nu, uint[] Nv, uint[] _p)
+#endif
+        {
+            int sWords = s >> 5, sBits = s & 31;
+
+            ulong cc__p = 0UL;
+            ulong cc_Nu = 0UL;
+
+            if (sBits == 0)
+            {
+                for (int i = sWords; i <= last; ++i)
+                {
+                    cc_Nu += Nu[i];
+                    cc_Nu += _p[i - sWords];
+
+                    cc__p += _p[i];
+                    cc__p += Nv[i - sWords];
+                    _p[i]  = (uint)cc__p; cc__p >>= 32;
+
+                    cc_Nu += _p[i - sWords];
+                    Nu[i]  = (uint)cc_Nu; cc_Nu >>= 32;
+                }
+            }
+            else
+            {
+                uint prev_p = 0U;
+                uint prev_q = 0U;
+                uint prev_v = 0U;
+
+                for (int i = sWords; i <= last; ++i)
+                {
+                    uint next_p = _p[i - sWords];
+                    uint p_s = (next_p << sBits) | (prev_p >> -sBits);
+                    prev_p = next_p;
+
+                    cc_Nu += Nu[i];
+                    cc_Nu += p_s;
+
+                    uint next_v = Nv[i - sWords];
+                    uint v_s = (next_v << sBits) | (prev_v >> -sBits);
+                    prev_v = next_v;
+
+                    cc__p += _p[i];
+                    cc__p += v_s;
+                    _p[i]  = (uint)cc__p; cc__p >>= 32;
+
+                    uint next_q = _p[i - sWords];
+                    uint q_s = (next_q << sBits) | (prev_q >> -sBits);
+                    prev_q = next_q;
+
+                    cc_Nu += q_s;
+                    Nu[i]  = (uint)cc_Nu; cc_Nu >>= 32;
+                }
+            }
+        }
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void AddShifted_UV(int last, int s, Span<uint> u0, Span<uint> u1, ReadOnlySpan<uint> v0,
+            ReadOnlySpan<uint> v1)
+#else
+        private static void AddShifted_UV(int last, int s, uint[] u0, uint[] u1, uint[] v0, uint[] v1)
+#endif
+        {
+            int sWords = s >> 5, sBits = s & 31;
+
+            ulong cc_u0 = 0UL;
+            ulong cc_u1 = 0UL;
+
+            if (sBits == 0)
+            {
+                for (int i = sWords; i <= last; ++i)
+                {
+                    cc_u0 += u0[i];
+                    cc_u1 += u1[i];
+                    cc_u0 += v0[i - sWords];
+                    cc_u1 += v1[i - sWords];
+                    u0[i]  = (uint)cc_u0; cc_u0 >>= 32;
+                    u1[i]  = (uint)cc_u1; cc_u1 >>= 32;
+                }
+            }
+            else
+            {
+                uint prev_v0 = 0U;
+                uint prev_v1 = 0U;
+
+                for (int i = sWords; i <= last; ++i)
+                {
+                    uint next_v0 = v0[i - sWords];
+                    uint next_v1 = v1[i - sWords];
+                    uint v0_s = (next_v0 << sBits) | (prev_v0 >> -sBits);
+                    uint v1_s = (next_v1 << sBits) | (prev_v1 >> -sBits);
+                    prev_v0 = next_v0;
+                    prev_v1 = next_v1;
+
+                    cc_u0 += u0[i];
+                    cc_u1 += u1[i];
+                    cc_u0 += v0_s;
+                    cc_u1 += v1_s;
+                    u0[i]  = (uint)cc_u0; cc_u0 >>= 32;
+                    u1[i]  = (uint)cc_u1; cc_u1 >>= 32;
+                }
+            }
+        }
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static int GetBitLength(int last, ReadOnlySpan<uint> x)
+#else
+        private static int GetBitLength(int last, uint[] x)
+#endif
+        {
+            int i = last;
+            uint sign = (uint)((int)x[i] >> 31);
+            while (i > 0 && x[i] == sign)
+            {
+                --i;
+            }
+            return i * 32 + 32 - Integers.NumberOfLeadingZeros((int)(x[i] ^ sign));
+        }
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static int GetBitLengthPositive(int last, ReadOnlySpan<uint> x)
+#else
+        private static int GetBitLengthPositive(int last, uint[] x)
+#endif
+        {
+            int i = last;
+            while (i > 0 && x[i] == 0)
+            {
+                --i;
+            }
+            return i * 32 + 32 - Integers.NumberOfLeadingZeros((int)x[i]);
+        }
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool LessThan(int last, ReadOnlySpan<uint> x, ReadOnlySpan<uint> y)
+#else
+        private static bool LessThan(int last, uint[] x, uint[] y)
+#endif
+        {
+            int i = last;
+            if ((int)x[i] < (int)y[i])
+                return true;
+            if ((int)x[i] > (int)y[i])
+                return false;
+            while (--i >= 0)
+            {
+                if (x[i] < y[i])
+                    return true;
+                if (x[i] > y[i])
+                    return false;
+            }
+            return false;
+        }
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void SubShifted_NP(int last, int s, Span<uint> Nu, ReadOnlySpan<uint> Nv, Span<uint> _p)
+#else
+        private static void SubShifted_NP(int last, int s, uint[] Nu, uint[] Nv, uint[] _p)
+#endif
+        {
+            int sWords = s >> 5, sBits = s & 31;
+
+            long cc__p = 0L;
+            long cc_Nu = 0L;
+
+            if (sBits == 0)
+            {
+                for (int i = sWords; i <= last; ++i)
+                {
+                    cc_Nu += Nu[i];
+                    cc_Nu -= _p[i - sWords];
+
+                    cc__p += _p[i];
+                    cc__p -= Nv[i - sWords];
+                    _p[i]  = (uint)cc__p; cc__p >>= 32;
+
+                    cc_Nu -= _p[i - sWords];
+                    Nu[i]  = (uint)cc_Nu; cc_Nu >>= 32;
+                }
+            }
+            else
+            {
+                uint prev_p = 0U;
+                uint prev_q = 0U;
+                uint prev_v = 0U;
+
+                for (int i = sWords; i <= last; ++i)
+                {
+                    uint next_p = _p[i - sWords];
+                    uint p_s = (next_p << sBits) | (prev_p >> -sBits);
+                    prev_p = next_p;
+
+                    cc_Nu += Nu[i];
+                    cc_Nu -= p_s;
+
+                    uint next_v = Nv[i - sWords];
+                    uint v_s = (next_v << sBits) | (prev_v >> -sBits);
+                    prev_v = next_v;
+
+                    cc__p += _p[i];
+                    cc__p -= v_s;
+                    _p[i]  = (uint)cc__p; cc__p >>= 32;
+
+                    uint next_q = _p[i - sWords];
+                    uint q_s = (next_q << sBits) | (prev_q >> -sBits);
+                    prev_q = next_q;
+
+                    cc_Nu -= q_s;
+                    Nu[i]  = (uint)cc_Nu; cc_Nu >>= 32;
+                }
+            }
+        }
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void SubShifted_UV(int last, int s, Span<uint> u0, Span<uint> u1, ReadOnlySpan<uint> v0,
+            ReadOnlySpan<uint> v1)
+#else
+        private static void SubShifted_UV(int last, int s, uint[] u0, uint[] u1, uint[] v0, uint[] v1)
+#endif
+        {
+            int sWords = s >> 5, sBits = s & 31;
+
+            long cc_u0 = 0L;
+            long cc_u1 = 0L;
+
+            if (sBits == 0)
+            {
+                for (int i = sWords; i <= last; ++i)
+                {
+                    cc_u0 += u0[i];
+                    cc_u1 += u1[i];
+                    cc_u0 -= v0[i - sWords];
+                    cc_u1 -= v1[i - sWords];
+                    u0[i]  = (uint)cc_u0; cc_u0 >>= 32;
+                    u1[i]  = (uint)cc_u1; cc_u1 >>= 32;
+                }
+            }
+            else
+            {
+                uint prev_v0 = 0U;
+                uint prev_v1 = 0U;
+
+                for (int i = sWords; i <= last; ++i)
+                {
+                    uint next_v0 = v0[i - sWords];
+                    uint next_v1 = v1[i - sWords];
+                    uint v0_s = (next_v0 << sBits) | (prev_v0 >> -sBits);
+                    uint v1_s = (next_v1 << sBits) | (prev_v1 >> -sBits);
+                    prev_v0 = next_v0;
+                    prev_v1 = next_v1;
+
+                    cc_u0 += u0[i];
+                    cc_u1 += u1[i];
+                    cc_u0 -= v0_s;
+                    cc_u1 -= v1_s;
+                    u0[i]  = (uint)cc_u0; cc_u0 >>= 32;
+                    u1[i]  = (uint)cc_u1; cc_u1 >>= 32;
+                }
+            }
+        }
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void Swap(ref Span<uint> x, ref Span<uint> y)
+#else
+        private static void Swap(ref uint[] x, ref uint[] y)
+#endif
+        {
+            var t = x; x = y; y = t;
+        }
+    }
+}
diff --git a/crypto/src/math/ec/rfc8032/Wnaf.cs b/crypto/src/math/ec/rfc8032/Wnaf.cs
index cc6e3704f..1b7d1465d 100644
--- a/crypto/src/math/ec/rfc8032/Wnaf.cs
+++ b/crypto/src/math/ec/rfc8032/Wnaf.cs
@@ -59,7 +59,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032
                 }
             }
 
-            Debug.Assert(sign == n[n.Length - 1] >> 31);
+            Debug.Assert((int)sign == (int)n[n.Length - 1] >> 31);
         }
     }
 }
diff --git a/crypto/test/src/math/ec/rfc8032/test/Ed25519Test.cs b/crypto/test/src/math/ec/rfc8032/test/Ed25519Test.cs
index d042aff9f..90fc24a4c 100644
--- a/crypto/test/src/math/ec/rfc8032/test/Ed25519Test.cs
+++ b/crypto/test/src/math/ec/rfc8032/test/Ed25519Test.cs
@@ -542,8 +542,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032.Tests
         [Test]
         public void TamingVector_04()
         {
-            // TODO Algorithm 2 accepts this (cofactored verification)
-            ImplTamingVector(4, false,
+            ImplTamingVector(4, true,
                 "e47d62c63f830dc7a6851a0b1f33ae4bb2f507fb6cffec4011eaccd55b53f56c",
                 "cdb267ce40c5cd45306fa5d2f29731459387dbf9eb933b7bd5aed9a765b88d4d",
                 "160a1cb0dc9c0258cd0a7d23e94d8fa878bcb1925f2c64246b2dee1796bed512" +
@@ -553,8 +552,7 @@ namespace Org.BouncyCastle.Math.EC.Rfc8032.Tests
         [Test]
         public void TamingVector_05()
         {
-            // TODO Algorithm 2 accepts this (cofactored verification)
-            ImplTamingVector(5, false,
+            ImplTamingVector(5, true,
                 "e47d62c63f830dc7a6851a0b1f33ae4bb2f507fb6cffec4011eaccd55b53f56c",
                 "cdb267ce40c5cd45306fa5d2f29731459387dbf9eb933b7bd5aed9a765b88d4d",
                 "21122a84e0b5fca4052f5b1235c80a537878b38f3142356b2c2384ebad4668b7" +