summary refs log tree commit diff
path: root/crypto/src
diff options
context:
space:
mode:
authorPeter Dettman <peter.dettman@bouncycastle.org>2018-04-17 10:41:26 +0700
committerPeter Dettman <peter.dettman@bouncycastle.org>2018-04-17 10:41:26 +0700
commitbd1f720cafbcbf30891a286718ee91c44e1c8a4e (patch)
tree5b5a25fd08f0755ef23fb55b61162b45dfed2b42 /crypto/src
parentCache-safety for EC lookup tables (diff)
downloadBouncyCastle.NET-ed25519-bd1f720cafbcbf30891a286718ee91c44e1c8a4e.tar.xz
Add X25519 and X448 from RFC 7748
- includes optimized ladders for base points
Diffstat (limited to 'crypto/src')
-rw-r--r--crypto/src/math/ec/rfc7748/X25519.cs249
-rw-r--r--crypto/src/math/ec/rfc7748/X25519Field.cs520
-rw-r--r--crypto/src/math/ec/rfc7748/X448.cs255
-rw-r--r--crypto/src/math/ec/rfc7748/X448Field.cs904
4 files changed, 1928 insertions, 0 deletions
diff --git a/crypto/src/math/ec/rfc7748/X25519.cs b/crypto/src/math/ec/rfc7748/X25519.cs
new file mode 100644
index 000000000..862b8e0d0
--- /dev/null
+++ b/crypto/src/math/ec/rfc7748/X25519.cs
@@ -0,0 +1,249 @@
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+
+namespace Org.BouncyCastle.Math.EC.Rfc7748
+{
+    public abstract class X25519
+    {
+        private const int C_A = 486662;
+        private const int C_A24 = (C_A + 2)/4;
+
+        // 0x1
+        //private static readonly int[] S_x = new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+        // 0x215132111D8354CB52385F46DCA2B71D440F6A51EB4D1207816B1E0137D48290
+        private static readonly int[] PsubS_x = new int[]{ 0x03D48290, 0x02C7804D, 0x01207816, 0x028F5A68, 0x00881ED4, 0x00A2B71D,
+            0x0217D1B7, 0x014CB523, 0x0088EC1A, 0x0042A264 };
+
+        private static int[] precompBase = null;
+
+        private static uint Decode32(byte[] bs, int off)
+        {
+            uint n = bs[off];
+            n |= (uint)bs[++off] << 8;
+            n |= (uint)bs[++off] << 16;
+            n |= (uint)bs[++off] << 24;
+            return n;
+        }
+
+        private static void DecodeScalar(byte[] k, int kOff, uint[] n)
+        {
+            for (int i = 0; i < 8; ++i)
+            {
+                n[i] = Decode32(k, kOff + i * 4);
+            }
+
+            n[0] &= 0xFFFFFFF8U;
+            n[7] &= 0x7FFFFFFFU;
+            n[7] |= 0x40000000U;
+        }
+
+        private static void PointDouble(int[] x, int[] z)
+        {
+            int[] A = X25519Field.Create();
+            int[] B = X25519Field.Create();
+
+            X25519Field.Apm(x, z, A, B);
+            X25519Field.Sqr(A, A);
+            X25519Field.Sqr(B, B);
+            X25519Field.Mul(A, B, x);
+            X25519Field.Sub(A, B, A);
+            X25519Field.Mul(A, C_A24, z);
+            X25519Field.Add(z, B, z);
+            X25519Field.Mul(z, A, z);
+        }
+
+        [MethodImpl(MethodImplOptions.Synchronized)]
+        public static void Precompute()
+        {
+            if (precompBase != null)
+                return;
+
+            precompBase = new int[X25519Field.Size * 252];
+
+            int[] xs = precompBase;
+            int[] zs = new int[X25519Field.Size * 251];
+
+            int[] x = X25519Field.Create();     x[0] = 9;          
+            int[] z = X25519Field.Create();     z[0] = 1;
+
+            int[] n = X25519Field.Create();
+            int[] d = X25519Field.Create();
+
+            X25519Field.Apm(x, z, n, d);
+
+            int[] c = X25519Field.Create();     X25519Field.Copy(d, 0, c, 0);
+
+            int off = 0;
+            for (;;)
+            {
+                X25519Field.Copy(n, 0, xs, off);
+
+                if (off == (X25519Field.Size * 251))
+                    break;
+
+                PointDouble(x, z);
+
+                X25519Field.Apm(x, z, n, d);
+                X25519Field.Mul(n, c, n);
+                X25519Field.Mul(c, d, c);
+
+                X25519Field.Copy(d, 0, zs, off);
+
+                off += X25519Field.Size;
+            }
+
+            int[] u = X25519Field.Create();
+            X25519Field.Inv(c, u);
+
+            for (;;)
+            {
+                X25519Field.Copy(xs, off, x, 0);
+
+                X25519Field.Mul(x, u, x);
+                //X25519Field.Normalize(x);
+                X25519Field.Copy(x, 0, precompBase, off);
+
+                if (off == 0)
+                    break;
+
+                off -= X25519Field.Size;
+                X25519Field.Copy(zs, off, z, 0);
+                X25519Field.Mul(u, z, u);
+            }
+        }
+
+        public static void ScalarMult(byte[] k, int kOff, byte[] u, int uOff, byte[] r, int rOff)
+        {
+            uint[] n = new uint[8];     DecodeScalar(k, kOff, n);
+
+            int[] x1 = X25519Field.Create();        X25519Field.Decode(u, uOff, x1);
+            int[] x2 = X25519Field.Create();        X25519Field.Copy(x1, 0, x2, 0);
+            int[] z2 = X25519Field.Create();        z2[0] = 1;
+            int[] x3 = X25519Field.Create();        x3[0] = 1;
+            int[] z3 = X25519Field.Create();
+
+            int[] t1 = X25519Field.Create();
+            int[] t2 = X25519Field.Create();
+
+            Debug.Assert(n[7] >> 30 == 1U);
+
+            int bit = 254, swap = 1;
+            do
+            {
+                X25519Field.Apm(x3, z3, t1, x3);
+                X25519Field.Apm(x2, z2, z3, x2);
+                X25519Field.Mul(t1, x2, t1);
+                X25519Field.Mul(x3, z3, x3);
+                X25519Field.Sqr(z3, z3);
+                X25519Field.Sqr(x2, x2);
+
+                X25519Field.Sub(z3, x2, t2);
+                X25519Field.Mul(t2, C_A24, z2);
+                X25519Field.Add(z2, x2, z2);
+                X25519Field.Mul(z2, t2, z2);
+                X25519Field.Mul(x2, z3, x2);
+
+                X25519Field.Apm(t1, x3, x3, z3);
+                X25519Field.Sqr(x3, x3);
+                X25519Field.Sqr(z3, z3);
+                X25519Field.Mul(z3, x1, z3);
+
+                --bit;
+
+                int word = bit >> 5, shift = bit & 0x1F;
+                int kt = (int)(n[word] >> shift) & 1;
+                swap ^= kt;
+                X25519Field.CSwap(swap, x2, x3);
+                X25519Field.CSwap(swap, z2, z3);
+                swap = kt;
+            }
+            while (bit >= 3);
+
+            Debug.Assert(swap == 0);
+
+            for (int i = 0; i < 3; ++i)
+            {
+                PointDouble(x2, z2);
+            }
+
+            X25519Field.Inv(z2, z2);
+            X25519Field.Mul(x2, z2, x2);
+
+            X25519Field.Normalize(x2);
+            X25519Field.Encode(x2, r, rOff);
+        }
+
+        private static void Dump(string label, int[] x)
+        {
+            int[] y = X25519Field.Create();
+            X25519Field.Copy(x, 0, y, 0);
+            X25519Field.Normalize(y);
+
+            byte[] r = new byte[32];
+            X25519Field.Encode(y, r, 0);
+            string val = Org.BouncyCastle.Utilities.Encoders.Hex.ToHexString(r);
+            Console.WriteLine("  " + label + ": " + val);
+        }
+
+        public static void ScalarMultBase(byte[] k, int kOff, byte[] r, int rOff)
+        {
+            Precompute();
+
+            uint[] n = new uint[8];     DecodeScalar(k, kOff, n);
+
+            int[] x0 = X25519Field.Create();
+            //int[] x1 = X25519Field.Create();        X25519Field.Copy(S_x, 0, x1, 0);
+            int[] x1 = X25519Field.Create();        x1[0] = 1;
+            int[] z1 = X25519Field.Create();        z1[0] = 1;        
+            int[] x2 = X25519Field.Create();        X25519Field.Copy(PsubS_x, 0, x2, 0);
+            int[] z2 = X25519Field.Create();        z2[0] = 1;        
+
+            int[] A = x1;
+            int[] B = z1;
+            int[] C = x0;
+            int[] D = A;
+            int[] E = B;
+
+            Debug.Assert(n[7] >> 30 == 1U);
+
+            int off = 0, bit = 3, swap = 1;
+            do
+            {
+                X25519Field.Copy(precompBase, off, x0, 0);
+                off += X25519Field.Size;
+
+                int word = bit >> 5, shift = bit & 0x1F;
+                int kt = (int)(n[word] >> shift) & 1;
+                swap ^= kt;
+                X25519Field.CSwap(swap, x1, x2);
+                X25519Field.CSwap(swap, z1, z2);
+                swap = kt;
+
+                X25519Field.Apm(x1, z1, A, B);
+                X25519Field.Mul(x0, B, C);
+                X25519Field.Carry(A);
+                X25519Field.Apm(A, C, D, E);
+                X25519Field.Sqr(D, D);
+                X25519Field.Sqr(E, E);
+                X25519Field.Mul(z2, D, x1);
+                X25519Field.Mul(x2, E, z1);
+            }
+            while (++bit < 255);
+
+            Debug.Assert(swap == 1);
+
+            for (int i = 0; i < 3; ++i)
+            {
+                PointDouble(x1, z1);
+            }
+
+            X25519Field.Inv(z1, z1);
+            X25519Field.Mul(x1, z1, x1);
+
+            X25519Field.Normalize(x1);
+            X25519Field.Encode(x1, r, rOff);
+        }
+    }
+}
diff --git a/crypto/src/math/ec/rfc7748/X25519Field.cs b/crypto/src/math/ec/rfc7748/X25519Field.cs
new file mode 100644
index 000000000..282f41628
--- /dev/null
+++ b/crypto/src/math/ec/rfc7748/X25519Field.cs
@@ -0,0 +1,520 @@
+using System;
+using System.Diagnostics;
+
+namespace Org.BouncyCastle.Math.EC.Rfc7748
+{
+    public abstract class X25519Field
+    {
+        public const int Size = 10;
+
+        private const int M24 = 0x00FFFFFF;
+        private const int M25 = 0x01FFFFFF;
+        private const int M26 = 0x03FFFFFF;
+
+        private X25519Field() {}
+
+        public static void Add(int[] x, int[] y, int[] z)
+        {
+            for (int i = 0; i < Size; ++i)
+            {
+                z[i] = x[i] + y[i];
+            }
+        }
+
+        public static void Apm(int[] x, int[] y, int[] zp, int[] zm)
+        {
+            for (int i = 0; i < Size; ++i)
+            {
+                int xi = x[i], yi = y[i];
+                zp[i] = xi + yi;
+                zm[i] = xi - yi;
+            }
+        }
+
+        public static void Carry(int[] z)
+        {
+            int z0 = z[0], z1 = z[1], z2 = z[2], z3 = z[3], z4 = z[4];
+            int z5 = z[5], z6 = z[6], z7 = z[7], z8 = z[8], z9 = z[9];
+
+            z3 += (z2 >> 25); z2 &= M25;
+            z5 += (z4 >> 25); z4 &= M25;
+            z8 += (z7 >> 25); z7 &= M25;
+            //z0 += (z9 >> 24) * 19; z9 &= M24;
+            z0 += (z9 >> 25) * 38; z9 &= M25;
+
+            z1 += (z0 >> 26); z0 &= M26;
+            z6 += (z5 >> 26); z5 &= M26;
+
+            z2 += (z1 >> 26); z1 &= M26;
+            z4 += (z3 >> 26); z3 &= M26;
+            z7 += (z6 >> 26); z6 &= M26;
+            z9 += (z8 >> 26); z8 &= M26;
+
+            z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4;
+            z[5] = z5; z[6] = z6; z[7] = z7; z[8] = z8; z[9] = z9;
+        }
+
+        public static void Copy(int[] x, int xOff, int[] z, int zOff)
+        {
+            for (int i = 0; i < Size; ++i)
+            {
+                z[zOff + i] = x[xOff + i];
+            }
+        }
+
+        public static int[] Create()
+        {
+            return new int[Size];
+        }
+
+        public static void CSwap(int swap, int[] a, int[] b)
+        {
+            Debug.Assert(swap >> 1 == 0);
+            Debug.Assert(a != b);
+
+            int mask = 0 - swap;
+            for (int i = 0; i < Size; ++i)
+            {
+                int ai = a[i], bi = b[i];
+                int dummy = mask & (ai ^ bi);
+                a[i] = ai ^ dummy; 
+                b[i] = bi ^ dummy; 
+            }
+        }
+
+        public static void Decode(byte[] x, int xOff, int[] z)
+        {
+            Decode128(x, xOff, z, 0);
+            Decode128(x, xOff + 16, z, 5);
+            z[9] &= M24;
+        }
+
+        private static void Decode128(byte[] bs, int off, int[] z, int zOff)
+        {
+            uint t0 = Decode32(bs, off + 0);
+            uint t1 = Decode32(bs, off + 4);
+            uint t2 = Decode32(bs, off + 8);
+            uint t3 = Decode32(bs, off + 12);
+
+            z[zOff + 0] = (int)t0 & M26;
+            z[zOff + 1] = (int)((t1 << 6) | (t0 >> 26)) & M26;
+            z[zOff + 2] = (int)((t2 << 12) | (t1 >> 20)) & M25;
+            z[zOff + 3] = (int)((t3 << 19) | (t2 >> 13)) & M26;
+            z[zOff + 4] = (int)(t3 >> 7);
+        }
+
+        private static uint Decode32(byte[] bs, int off)
+        {
+            uint n = bs[off];
+            n |= (uint)bs[++off] << 8;
+            n |= (uint)bs[++off] << 16;
+            n |= (uint)bs[++off] << 24;
+            return n;
+        }
+
+        public static void Encode(int[] x, byte[] z, int zOff)
+        {
+            Encode128(x, 0, z, zOff);
+            Encode128(x, 5, z, zOff + 16);
+        }
+
+        private static void Encode128(int[] x, int xOff, byte[] bs, int off)
+        {
+            uint x0 = (uint)x[xOff + 0], x1 = (uint)x[xOff + 1], x2 = (uint)x[xOff + 2];
+            uint x3 = (uint)x[xOff + 3], x4 = (uint)x[xOff + 4];
+
+            uint t0 =  x0        | (x1 << 26);  Encode32(t0, bs, off + 0);
+            uint t1 = (x1 >>  6) | (x2 << 20);  Encode32(t1, bs, off + 4);
+            uint t2 = (x2 >> 12) | (x3 << 13);  Encode32(t2, bs, off + 8);
+            uint t3 = (x3 >> 19) | (x4 <<  7);  Encode32(t3, bs, off + 12);
+        }
+
+        private static void Encode32(uint n, byte[] bs, int off)
+        {
+            bs[  off] = (byte)(n      );
+            bs[++off] = (byte)(n >>  8);
+            bs[++off] = (byte)(n >> 16);
+            bs[++off] = (byte)(n >> 24);
+        }
+
+        public static void Inv(int[] x, int[] z)
+        {
+            // z = x^(p-2) = x^7FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEB
+            // (250 1s) (1 0s) (1 1s) (1 0s) (2 1s)
+            // Addition chain: [1] [2] 3 5 10 15 25 50 75 125 [250]
+
+            int[] x2 = Create();    Sqr(x, x2);             Mul(x, x2, x2);
+            int[] x3 = Create();    Sqr(x2, x3);            Mul(x, x3, x3);
+            int[] x5 = x3;          Sqr(x3, 2, x5);         Mul(x2, x5, x5);
+            int[] x10 = Create();   Sqr(x5, 5, x10);        Mul(x5, x10, x10);
+            int[] x15 = Create();   Sqr(x10, 5, x15);       Mul(x5, x15, x15);
+            int[] x25 = x5;         Sqr(x15, 10, x25);      Mul(x10, x25, x25);
+            int[] x50 = x10;        Sqr(x25, 25, x50);      Mul(x25, x50, x50);
+            int[] x75 = x15;        Sqr(x50, 25, x75);      Mul(x25, x75, x75);
+            int[] x125 = x25;       Sqr(x75, 50, x125);     Mul(x50, x125, x125);
+            int[] x250 = x50;       Sqr(x125, 125, x250);   Mul(x125, x250, x250);
+
+            int[] t = x125;
+            Sqr(x250, 2, t);
+            Mul(t, x, t);
+            Sqr(t, 3, t);
+            Mul(t, x2, z);
+        }
+
+        public static void Mul(int[] x, int y, int[] z)
+        {
+            int x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4];
+            int x5 = x[5], x6 = x[6], x7 = x[7], x8 = x[8], x9 = x[9];
+            long c0, c1, c2, c3;
+
+            c0  = (long)x2 * y; x2 = (int)c0 & M25; c0 >>= 25;
+            c1  = (long)x4 * y; x4 = (int)c1 & M25; c1 >>= 25;
+            c2  = (long)x7 * y; x7 = (int)c2 & M25; c2 >>= 25;
+            //c3 = (long)x9 * y; x9 = (int)c3 & M24; c3 >>= 24;
+            //c3 *= 19;
+            c3  = (long)x9 * y; x9 = (int)c3 & M25; c3 >>= 25;
+            c3 *= 38;
+
+            c3 += (long)x0 * y; z[0] = (int)c3 & M26; c3 >>= 26;
+            c1 += (long)x5 * y; z[5] = (int)c1 & M26; c1 >>= 26;
+
+            c3 += (long)x1 * y; z[1] = (int)c3 & M26; c3 >>= 26;
+            c0 += (long)x3 * y; z[3] = (int)c0 & M26; c0 >>= 26;
+            c1 += (long)x6 * y; z[6] = (int)c1 & M26; c1 >>= 26;
+            c2 += (long)x8 * y; z[8] = (int)c2 & M26; c2 >>= 26;
+
+            z[2] = x2 + (int)c3;
+            z[4] = x4 + (int)c0;
+            z[7] = x7 + (int)c1;
+            z[9] = x9 + (int)c2;
+        }
+
+        public static void Mul(int[] x, int[] y, int[] z)
+        {
+            int x0 = x[0], y0 = y[0];
+            int x1 = x[1], y1 = y[1];
+            int x2 = x[2], y2 = y[2];
+            int x3 = x[3], y3 = y[3];
+            int x4 = x[4], y4 = y[4];
+
+            int u0 = x[5], v0 = y[5];
+            int u1 = x[6], v1 = y[6];
+            int u2 = x[7], v2 = y[7];
+            int u3 = x[8], v3 = y[8];
+            int u4 = x[9], v4 = y[9];
+
+            long a0  = (long)x0 * y0;
+            long a1  = (long)x0 * y1
+                     + (long)x1 * y0;
+            long a2  = (long)x0 * y2
+                     + (long)x1 * y1
+                     + (long)x2 * y0;
+            long a3  = (long)x1 * y2
+                     + (long)x2 * y1;
+            a3     <<= 1;
+            a3      += (long)x0 * y3
+                     + (long)x3 * y0;
+            long a4  = (long)x2 * y2;
+            a4     <<= 1;
+            a4      += (long)x0 * y4
+                     + (long)x1 * y3
+                     + (long)x3 * y1
+                     + (long)x4 * y0;
+            long a5  = (long)x1 * y4
+                     + (long)x2 * y3
+                     + (long)x3 * y2
+                     + (long)x4 * y1;
+            a5     <<= 1;
+            long a6  = (long)x2 * y4
+                     + (long)x4 * y2;
+            a6     <<= 1;
+            a6      += (long)x3 * y3;
+            long a7  = (long)x3 * y4
+                     + (long)x4 * y3;
+            long a8  = (long)x4 * y4;
+            a8     <<= 1;
+
+            long b0  = (long)u0 * v0;
+            long b1  = (long)u0 * v1
+                     + (long)u1 * v0;
+            long b2  = (long)u0 * v2
+                     + (long)u1 * v1
+                     + (long)u2 * v0;
+            long b3  = (long)u1 * v2
+                     + (long)u2 * v1;
+            b3     <<= 1;
+            b3      += (long)u0 * v3
+                     + (long)u3 * v0;
+            long b4  = (long)u2 * v2;
+            b4     <<= 1;
+            b4      += (long)u0 * v4
+                     + (long)u1 * v3
+                     + (long)u3 * v1
+                     + (long)u4 * v0;
+            long b5  = (long)u1 * v4
+                     + (long)u2 * v3
+                     + (long)u3 * v2
+                     + (long)u4 * v1;
+            //b5     <<= 1;
+            long b6  = (long)u2 * v4
+                     + (long)u4 * v2;
+            b6     <<= 1;
+            b6      += (long)u3 * v3;
+            long b7  = (long)u3 * v4
+                     + (long)u4 * v3;
+            long b8  = (long)u4 * v4;
+            //b8     <<= 1;
+
+            a0 -= b5 * 76;
+            a1 -= b6 * 38;
+            a2 -= b7 * 38;
+            a3 -= b8 * 76;
+
+            a5 -= b0;
+            a6 -= b1;
+            a7 -= b2;
+            a8 -= b3;
+            //long a9 = -b4;
+
+            x0 += u0; y0 += v0;
+            x1 += u1; y1 += v1;
+            x2 += u2; y2 += v2;
+            x3 += u3; y3 += v3;
+            x4 += u4; y4 += v4;
+
+            long c0  = (long)x0 * y0;
+            long c1  = (long)x0 * y1
+                     + (long)x1 * y0;
+            long c2  = (long)x0 * y2
+                     + (long)x1 * y1
+                     + (long)x2 * y0;
+            long c3  = (long)x1 * y2
+                     + (long)x2 * y1;
+            c3     <<= 1;
+            c3      += (long)x0 * y3
+                     + (long)x3 * y0;
+            long c4  = (long)x2 * y2;
+            c4     <<= 1;
+            c4      += (long)x0 * y4
+                     + (long)x1 * y3
+                     + (long)x3 * y1
+                     + (long)x4 * y0;
+            long c5  = (long)x1 * y4
+                     + (long)x2 * y3
+                     + (long)x3 * y2
+                     + (long)x4 * y1;
+            c5     <<= 1;
+            long c6  = (long)x2 * y4
+                     + (long)x4 * y2;
+            c6     <<= 1;
+            c6      += (long)x3 * y3;
+            long c7  = (long)x3 * y4
+                     + (long)x4 * y3;
+            long c8  = (long)x4 * y4;
+            c8     <<= 1;
+
+            int z8, z9;
+            long t;
+
+            t        = a8 + (c3 - a3);
+            z8       = (int)t & M26; t >>= 26;
+            //t       += a9 + (c4 - a4);
+            t       +=      (c4 - a4) - b4;
+            //z9       = (int)t & M24; t >>= 24;
+            //t        = a0 + (t + ((c5 - a5) << 1)) * 19;
+            z9       = (int)t & M25; t >>= 25;
+            t        = a0 + (t + c5 - a5) * 38;
+            z[0]     = (int)t & M26; t >>= 26;
+            t       += a1 + (c6 - a6) * 38;
+            z[1]     = (int)t & M26; t >>= 26;
+            t       += a2 + (c7 - a7) * 38;
+            z[2]     = (int)t & M25; t >>= 25;
+            t       += a3 + (c8 - a8) * 38;
+            z[3]     = (int)t & M26; t >>= 26;
+            //t       += a4 - a9 * 38;
+            t       += a4 + b4 * 38;
+            z[4]     = (int)t & M25; t >>= 25;
+            t       += a5 + (c0 - a0);
+            z[5]     = (int)t & M26; t >>= 26;
+            t       += a6 + (c1 - a1);
+            z[6]     = (int)t & M26; t >>= 26;
+            t       += a7 + (c2 - a2);
+            z[7]     = (int)t & M25; t >>= 25;
+            t       += z8;
+            z[8]     = (int)t & M26; t >>= 26;
+            z[9]     = z9 + (int)t;
+        }
+
+        public static void Normalize(int[] z)
+        {
+            int x = (z[9] >> 23) & 1;
+            Reduce(z, x);
+            Reduce(z, -x);
+            Debug.Assert(z[9] >> 24 == 0);
+        }
+
+        private static void Reduce(int[] z, int c)
+        {
+            int z9 = z[9], t = z9;
+                       z9   = t & M24; t >>= 24;
+            t += c;
+            t *= 19;
+            t += z[0]; z[0] = t & M26; t >>= 26;
+            t += z[1]; z[1] = t & M26; t >>= 26;
+            t += z[2]; z[2] = t & M25; t >>= 25;
+            t += z[3]; z[3] = t & M26; t >>= 26;
+            t += z[4]; z[4] = t & M25; t >>= 25;
+            t += z[5]; z[5] = t & M26; t >>= 26;
+            t += z[6]; z[6] = t & M26; t >>= 26;
+            t += z[7]; z[7] = t & M25; t >>= 25;
+            t += z[8]; z[8] = t & M26; t >>= 26;
+            t += z9;   z[9] = t;
+        }
+
+        public static void Sqr(int[] x, int[] z)
+        {
+            int x0 = x[0];
+            int x1 = x[1];
+            int x2 = x[2];
+            int x3 = x[3];
+            int x4 = x[4];
+
+            int u0 = x[5];
+            int u1 = x[6];
+            int u2 = x[7];
+            int u3 = x[8];
+            int u4 = x[9];
+
+            int x1_2 = x1 * 2;
+            int x2_2 = x2 * 2;
+            int x3_2 = x3 * 2;
+            int x4_2 = x4 * 2;
+
+            long a0  = (long)x0 * x0;
+            long a1  = (long)x0 * x1_2;
+            long a2  = (long)x0 * x2_2
+                     + (long)x1 * x1;
+            long a3  = (long)x1_2 * x2_2
+                     + (long)x0 * x3_2;
+            long a4  = (long)x2 * x2_2
+                     + (long)x0 * x4_2
+                     + (long)x1 * x3_2;
+            long a5  = (long)x1_2 * x4_2
+                     + (long)x2_2 * x3_2;
+            long a6  = (long)x2_2 * x4_2
+                     + (long)x3 * x3;
+            long a7  = (long)x3 * x4_2;
+            long a8  = (long)x4 * x4_2;
+
+            int u1_2 = u1 * 2;
+            int u2_2 = u2 * 2;
+            int u3_2 = u3 * 2;
+            int u4_2 = u4 * 2;
+        
+            long b0  = (long)u0 * u0;
+            long b1  = (long)u0 * u1_2;
+            long b2  = (long)u0 * u2_2
+                     + (long)u1 * u1;
+            long b3  = (long)u1_2 * u2_2
+                     + (long)u0 * u3_2;
+            long b4  = (long)u2 * u2_2
+                     + (long)u0 * u4_2
+                     + (long)u1 * u3_2;
+            long b5  = (long)u1_2 * u4_2
+                     + (long)u2_2 * u3_2;
+            long b6  = (long)u2_2 * u4_2
+                     + (long)u3 * u3;
+            long b7  = (long)u3 * u4_2;
+            long b8  = (long)u4 * u4_2;
+
+            a0 -= b5 * 38;
+            a1 -= b6 * 38;
+            a2 -= b7 * 38;
+            a3 -= b8 * 38;
+
+            a5 -= b0;
+            a6 -= b1;
+            a7 -= b2;
+            a8 -= b3;
+            //long a9 = -b4;
+
+            x0 += u0;
+            x1 += u1;
+            x2 += u2;
+            x3 += u3;
+            x4 += u4;
+
+            x1_2 = x1 * 2;
+            x2_2 = x2 * 2;
+            x3_2 = x3 * 2;
+            x4_2 = x4 * 2;
+
+            long c0  = (long)x0 * x0;
+            long c1  = (long)x0 * x1_2;
+            long c2  = (long)x0 * x2_2
+                     + (long)x1 * x1;
+            long c3  = (long)x1_2 * x2_2
+                     + (long)x0 * x3_2;
+            long c4  = (long)x2 * x2_2
+                     + (long)x0 * x4_2
+                     + (long)x1 * x3_2;
+            long c5  = (long)x1_2 * x4_2
+                     + (long)x2_2 * x3_2;
+            long c6  = (long)x2_2 * x4_2
+                     + (long)x3 * x3;
+            long c7  = (long)x3 * x4_2;
+            long c8  = (long)x4 * x4_2;
+
+            int z8, z9;
+            long t;
+
+            t        = a8 + (c3 - a3);
+            z8       = (int)t & M26; t >>= 26;
+            //t       += a9 + (c4 - a4);
+            t       +=      (c4 - a4) - b4;
+            //z9       = (int)t & M24; t >>= 24;
+            //t        = a0 + (t + ((c5 - a5) << 1)) * 19;
+            z9       = (int)t & M25; t >>= 25;
+            t        = a0 + (t + c5 - a5) * 38;
+            z[0]     = (int)t & M26; t >>= 26;
+            t       += a1 + (c6 - a6) * 38;
+            z[1]     = (int)t & M26; t >>= 26;
+            t       += a2 + (c7 - a7) * 38;
+            z[2]     = (int)t & M25; t >>= 25;
+            t       += a3 + (c8 - a8) * 38;
+            z[3]     = (int)t & M26; t >>= 26;
+            //t       += a4 - a9 * 38;
+            t       += a4 + b4 * 38;
+            z[4]     = (int)t & M25; t >>= 25;
+            t       += a5 + (c0 - a0);
+            z[5]     = (int)t & M26; t >>= 26;
+            t       += a6 + (c1 - a1);
+            z[6]     = (int)t & M26; t >>= 26;
+            t       += a7 + (c2 - a2);
+            z[7]     = (int)t & M25; t >>= 25;
+            t       += z8;
+            z[8]     = (int)t & M26; t >>= 26;
+            z[9]     = z9 + (int)t;
+        }
+
+        public static void Sqr(int[] x, int n, int[] z)
+        {
+            Debug.Assert(n > 0);
+
+            Sqr(x, z);
+
+            while (--n > 0)
+            {
+                Sqr(z, z);
+            }
+        }
+
+        public static void Sub(int[] x, int[] y, int[] z)
+        {
+            for (int i = 0; i < Size; ++i)
+            {
+                z[i] = x[i] - y[i];
+            }
+        }
+    }
+}
diff --git a/crypto/src/math/ec/rfc7748/X448.cs b/crypto/src/math/ec/rfc7748/X448.cs
new file mode 100644
index 000000000..32a4a9e2a
--- /dev/null
+++ b/crypto/src/math/ec/rfc7748/X448.cs
@@ -0,0 +1,255 @@
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+
+namespace Org.BouncyCastle.Math.EC.Rfc7748
+{
+    public abstract class X448
+    {
+        private const uint C_A = 156326;
+        private const uint C_A24 = (C_A + 2)/4;
+
+        // 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE
+        private static readonly uint[] S_x = new uint[]{ 0x0FFFFFFEU, 0x0FFFFFFFU, 0x0FFFFFFFU, 0x0FFFFFFFU, 0x0FFFFFFFU, 0x0FFFFFFFU,
+            0x0FFFFFFFU, 0x0FFFFFFFU, 0x0FFFFFFEU, 0x0FFFFFFFU, 0x0FFFFFFFU, 0x0FFFFFFFU, 0x0FFFFFFFU, 0x0FFFFFFFU, 0x0FFFFFFFU,
+            0x0FFFFFFFU };
+
+        // 0xF0FAB725013244423ACF03881AFFEB7BDACDD1031C81B9672954459D84C1F823F1BD65643ACE1B5123AC33FF1C69BAF8ACB1197DC99D2720
+        private static readonly uint[] PsubS_x = new uint[]{ 0x099d2720U, 0x0b1197dcU, 0x09baf8acU, 0x033ff1c6U, 0x0b5123acU,
+            0x0643ace1U, 0x03f1bd65U, 0x084c1f82U, 0x0954459dU, 0x081b9672U, 0x0dd1031cU, 0x0eb7bdacU, 0x03881affU, 0x0423acf0U,
+            0x05013244U, 0x0f0fab72U };
+
+        private static uint[] precompBase = null;
+
+        private static uint Decode32(byte[] bs, int off)
+        {
+            uint n = bs[off];
+            n |= (uint)bs[++off] << 8;
+            n |= (uint)bs[++off] << 16;
+            n |= (uint)bs[++off] << 24;
+            return n;
+        }
+
+        private static void DecodeScalar(byte[] k, int kOff, uint[] n)
+        {
+            for (int i = 0; i < 14; ++i)
+            {
+                n[i] = Decode32(k, kOff + i * 4);
+            }
+
+            n[ 0] &= 0xFFFFFFFCU;
+            n[13] |= 0x80000000U;
+        }
+
+        private static void PointDouble(uint[] x, uint[] z)
+        {
+            uint[] A = X448Field.Create();
+            uint[] B = X448Field.Create();
+
+            //X448Field.Apm(x, z, A, B);
+            X448Field.Add(x, z, A);
+            X448Field.Sub(x, z, B);
+            X448Field.Sqr(A, A);
+            X448Field.Sqr(B, B);
+            X448Field.Mul(A, B, x);
+            X448Field.Sub(A, B, A);
+            X448Field.Mul(A, C_A24, z);
+            X448Field.Add(z, B, z);
+            X448Field.Mul(z, A, z);
+        }
+
+        [MethodImpl(MethodImplOptions.Synchronized)]
+        public static void Precompute()
+        {
+            if (precompBase != null)
+                return;
+
+            precompBase = new uint[X448Field.Size * 446];
+
+            uint[] xs = precompBase;
+            uint[] zs = new uint[X448Field.Size * 445];
+
+            uint[] x = X448Field.Create();     x[0] = 5;          
+            uint[] z = X448Field.Create();     z[0] = 1;
+
+            uint[] n = X448Field.Create();
+            uint[] d = X448Field.Create();
+
+            //X448Field.Apm(x, z, n, d);
+            X448Field.Add(x, z, n);
+            X448Field.Sub(x, z, d);
+
+            uint[] c = X448Field.Create();     X448Field.Copy(d, 0, c, 0);
+
+            int off = 0;
+            for (;;)
+            {
+                X448Field.Copy(n, 0, xs, off);
+
+                if (off == (X448Field.Size * 445))
+                    break;
+
+                PointDouble(x, z);
+
+                //X448Field.Apm(x, z, n, d);
+                X448Field.Add(x, z, n);
+                X448Field.Sub(x, z, d);
+                X448Field.Mul(n, c, n);
+                X448Field.Mul(c, d, c);
+
+                X448Field.Copy(d, 0, zs, off);
+
+                off += X448Field.Size;
+            }
+
+            uint[] u = X448Field.Create();
+            X448Field.Inv(c, u);
+
+            for (;;)
+            {
+                X448Field.Copy(xs, off, x, 0);
+
+                X448Field.Mul(x, u, x);
+                //X448Field.Normalize(x);
+                X448Field.Copy(x, 0, precompBase, off);
+
+                if (off == 0)
+                    break;
+
+                off -= X448Field.Size;
+                X448Field.Copy(zs, off, z, 0);
+                X448Field.Mul(u, z, u);
+            }
+        }
+
+        public static void ScalarMult(byte[] k, int kOff, byte[] u, int uOff, byte[] r, int rOff)
+        {
+            uint[] n = new uint[14];    DecodeScalar(k, kOff, n);
+
+            uint[] x1 = X448Field.Create();     X448Field.Decode(u, uOff, x1);
+            uint[] x2 = X448Field.Create();     X448Field.Copy(x1, 0, x2, 0);
+            uint[] z2 = X448Field.Create();     z2[0] = 1;
+            uint[] x3 = X448Field.Create();     x3[0] = 1;
+            uint[] z3 = X448Field.Create();
+
+            uint[] t1 = X448Field.Create();
+            uint[] t2 = X448Field.Create();
+
+            Debug.Assert(n[13] >> 31 == 1U);
+
+            int bit = 447, swap = 1;
+            do
+            {
+                //X448Field.Apm(x3, z3, t1, x3);
+                X448Field.Add(x3, z3, t1);
+                X448Field.Sub(x3, z3, x3);
+                //X448Field.Apm(x2, z2, z3, x2);
+                X448Field.Add(x2, z2, z3);
+                X448Field.Sub(x2, z2, x2);
+
+                X448Field.Mul(t1, x2, t1);
+                X448Field.Mul(x3, z3, x3);
+                X448Field.Sqr(z3, z3);
+                X448Field.Sqr(x2, x2);
+
+                X448Field.Sub(z3, x2, t2);
+                X448Field.Mul(t2, C_A24, z2);
+                X448Field.Add(z2, x2, z2);
+                X448Field.Mul(z2, t2, z2);
+                X448Field.Mul(x2, z3, x2);
+
+                //X448Field.Apm(t1, x3, x3, z3);
+                X448Field.Sub(t1, x3, z3);
+                X448Field.Add(t1, x3, x3);
+                X448Field.Sqr(x3, x3);
+                X448Field.Sqr(z3, z3);
+                X448Field.Mul(z3, x1, z3);
+
+                --bit;
+
+                int word = bit >> 5, shift = bit & 0x1F;
+                int kt = (int)(n[word] >> shift) & 1;
+                swap ^= kt;
+                X448Field.CSwap(swap, x2, x3);
+                X448Field.CSwap(swap, z2, z3);
+                swap = kt;
+            }
+            while (bit >= 2);
+
+            Debug.Assert(swap == 0);
+
+            for (int i = 0; i < 2; ++i)
+            {
+                PointDouble(x2, z2);
+            }
+
+            X448Field.Inv(z2, z2);
+            X448Field.Mul(x2, z2, x2);
+
+            X448Field.Normalize(x2);
+            X448Field.Encode(x2, r, rOff);
+        }
+
+        public static void ScalarMultBase(byte[] k, int kOff, byte[] r, int rOff)
+        {
+            Precompute();
+
+            uint[] n = new uint[14];    DecodeScalar(k, kOff, n);
+
+            uint[] x0 = X448Field.Create();
+            uint[] x1 = X448Field.Create();     X448Field.Copy(S_x, 0, x1, 0);
+            uint[] z1 = X448Field.Create();     z1[0] = 1;        
+            uint[] x2 = X448Field.Create();     X448Field.Copy(PsubS_x, 0, x2, 0);
+            uint[] z2 = X448Field.Create();     z2[0] = 1;
+
+            uint[] A = X448Field.Create();
+            uint[] B = z1;
+            uint[] C = x0;
+            uint[] D = x1;
+            uint[] E = B;
+
+            Debug.Assert(n[13] >> 31 == 1U);
+
+            int off = 0, bit = 2, swap = 1;
+            do
+            {
+                X448Field.Copy(precompBase, off, x0, 0);
+                off += X448Field.Size;
+
+                int word = bit >> 5, shift = bit & 0x1F;
+                int kt = (int)(n[word] >> shift) & 1;
+                swap ^= kt;
+                X448Field.CSwap(swap, x1, x2);
+                X448Field.CSwap(swap, z1, z2);
+                swap = kt;
+
+                //X448Field.Apm(x1, z1, A, B);
+                X448Field.Add(x1, z1, A);
+                X448Field.Sub(x1, z1, B);
+                X448Field.Mul(x0, B, C);
+                X448Field.Carry(A);
+                //X448Field.Apm(A, C, D, E);
+                X448Field.Add(A, C, D);
+                X448Field.Sub(A, C, E);
+                X448Field.Sqr(D, D);
+                X448Field.Sqr(E, E);
+                X448Field.Mul(z2, D, x1);
+                X448Field.Mul(x2, E, z1);
+            }
+            while (++bit < 448);
+
+            Debug.Assert(swap == 1);
+
+            for (int i = 0; i < 2; ++i)
+            {
+                PointDouble(x1, z1);
+            }
+
+            X448Field.Inv(z1, z1);
+            X448Field.Mul(x1, z1, x1);
+
+            X448Field.Normalize(x1);
+            X448Field.Encode(x1, r, rOff);
+        }
+    }
+}
diff --git a/crypto/src/math/ec/rfc7748/X448Field.cs b/crypto/src/math/ec/rfc7748/X448Field.cs
new file mode 100644
index 000000000..0c44f1eb5
--- /dev/null
+++ b/crypto/src/math/ec/rfc7748/X448Field.cs
@@ -0,0 +1,904 @@
+using System;
+using System.Diagnostics;
+
+namespace Org.BouncyCastle.Math.EC.Rfc7748
+{
+    [CLSCompliantAttribute(false)]
+    public abstract class X448Field
+    {
+        public const int Size = 16;
+
+        private const uint M28 = 0x0FFFFFFFU;
+
+        private X448Field() {}
+
+        public static void Add(uint[] x, uint[] y, uint[] z)
+        {
+            for (int i = 0; i < Size; ++i)
+            {
+                z[i] = x[i] + y[i];
+            }
+        }
+
+        //public static void Apm(int[] x, int[] y, int[] zp, int[] zm)
+        //{
+        //    for (int i = 0; i < Size; ++i)
+        //    {
+        //        int xi = x[i], yi = y[i];
+        //        zp[i] = xi + yi;
+        //        zm[i] = xi - yi;
+        //    }
+        //}
+
+        public static void Carry(uint[] z)
+        {
+            uint z0 = z[0], z1 = z[1], z2 = z[2], z3 = z[3], z4 = z[4], z5 = z[5], z6 = z[6], z7 = z[7];
+            uint z8 = z[8], z9 = z[9], z10 = z[10], z11 = z[11], z12 = z[12], z13 = z[13], z14 = z[14], z15 = z[15];
+
+            z2   += (z1 >> 28); z1 &= M28;
+            z6   += (z5 >> 28); z5 &= M28;
+            z10  += (z9 >> 28); z9 &= M28;
+            z14  += (z13 >> 28); z13 &= M28;
+
+            z3   += (z2 >> 28); z2 &= M28;
+            z7   += (z6 >> 28); z6 &= M28;
+            z11  += (z10 >> 28); z10 &= M28;
+            z15  += (z14 >> 28); z14 &= M28;
+
+            uint t = z15 >> 28; z15 &= M28;
+            z0   += t;
+            z8   += t;
+
+            z4   += (z3 >> 28); z3 &= M28;
+            z8   += (z7 >> 28); z7 &= M28;
+            z12  += (z11 >> 28); z11 &= M28;
+
+            z1   += (z0 >> 28); z0 &= M28;
+            z5   += (z4 >> 28); z4 &= M28;
+            z9   += (z8 >> 28); z8 &= M28;
+            z13  += (z12 >> 28); z12 &= M28;
+
+            z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4; z[5] = z5; z[6] = z6; z[7] = z7;
+            z[8] = z8; z[9] = z9; z[10] = z10; z[11] = z11; z[12] = z12; z[13] = z13; z[14] = z14; z[15] = z15;
+        }
+
+        public static void Copy(uint[] x, int xOff, uint[] z, int zOff)
+        {
+            for (int i = 0; i < Size; ++i)
+            {
+                z[zOff + i] = x[xOff + i];
+            }
+        }
+
+        public static uint[] Create()
+        {
+            return new uint[Size];
+        }
+
+        public static void CSwap(int swap, uint[] a, uint[] b)
+        {
+            Debug.Assert(swap >> 1 == 0);
+            Debug.Assert(a != b);
+
+            uint mask = (uint)(0 - swap);
+            for (int i = 0; i < Size; ++i)
+            {
+                uint ai = a[i], bi = b[i];
+                uint dummy = mask & (ai ^ bi);
+                a[i] = ai ^ dummy; 
+                b[i] = bi ^ dummy; 
+            }
+        }
+
+        public static void Decode(byte[] x, int xOff, uint[] z)
+        {
+            Decode56(x, xOff, z, 0);
+            Decode56(x, xOff + 7, z, 2);
+            Decode56(x, xOff + 14, z, 4);
+            Decode56(x, xOff + 21, z, 6);
+            Decode56(x, xOff + 28, z, 8);
+            Decode56(x, xOff + 35, z, 10);
+            Decode56(x, xOff + 42, z, 12);
+            Decode56(x, xOff + 49, z, 14);
+        }
+
+        private static uint Decode24(byte[] bs, int off)
+        {
+            uint n = bs[off];
+            n |= (uint)bs[++off] << 8;
+            n |= (uint)bs[++off] << 16;
+            return n;
+        }
+
+        private static uint Decode32(byte[] bs, int off)
+        {
+            uint n = bs[off];
+            n |= (uint)bs[++off] << 8;
+            n |= (uint)bs[++off] << 16;
+            n |= (uint)bs[++off] << 24;
+            return n;
+        }
+
+        private static void Decode56(byte[] bs, int off, uint[] z, int zOff)
+        {
+            uint lo = Decode32(bs, off);
+            uint hi = Decode24(bs, off + 4);
+            z[zOff] = lo & M28;
+            z[zOff + 1] = (lo >> 28) | (hi << 4);
+        }
+
+        public static void Encode(uint[] x, byte[] z, int zOff)
+        {
+            Encode56(x, 0, z, zOff);
+            Encode56(x, 2, z, zOff + 7);
+            Encode56(x, 4, z, zOff + 14);
+            Encode56(x, 6, z, zOff + 21);
+            Encode56(x, 8, z, zOff + 28);
+            Encode56(x, 10, z, zOff + 35);
+            Encode56(x, 12, z, zOff + 42);
+            Encode56(x, 14, z, zOff + 49);
+        }
+
+        private static void Encode24(uint n, byte[] bs, int off)
+        {
+            bs[  off] = (byte)(n      );
+            bs[++off] = (byte)(n >>  8);
+            bs[++off] = (byte)(n >> 16);
+        }
+
+        private static void Encode32(uint n, byte[] bs, int off)
+        {
+            bs[  off] = (byte)(n      );
+            bs[++off] = (byte)(n >>  8);
+            bs[++off] = (byte)(n >> 16);
+            bs[++off] = (byte)(n >> 24);
+        }
+
+        private static void Encode56(uint[] x, int xOff, byte[] bs, int off)
+        {
+            uint lo = x[xOff], hi = x[xOff + 1];
+            Encode32(lo | (hi << 28), bs, off);
+            Encode24(hi >> 4, bs, off + 4);
+        }
+
+        public static void Inv(uint[] x, uint[] z)
+        {
+            // z = x^(p-2) = x^(2^448 - 2^224 - 3)
+            // (223 1s) (1 0s) (222 1s) (1 0s) (1 1s)
+            // Addition chain: [1] 2 3 6 9 18 19 37 74 111 [222] [223]
+            uint[] x2 = Create();    Sqr(x, x2);             Mul(x, x2, x2);
+            uint[] x3 = Create();    Sqr(x2, x3);            Mul(x, x3, x3);
+            uint[] x6 = Create();    Sqr(x3, 3, x6);         Mul(x3, x6, x6);
+            uint[] x9 = Create();    Sqr(x6, 3, x9);         Mul(x3, x9, x9);
+            uint[] x18 = Create();   Sqr(x9, 9, x18);        Mul(x9, x18, x18);
+            uint[] x19 = Create();   Sqr(x18, x19);          Mul(x, x19, x19);
+            uint[] x37 = Create();   Sqr(x19, 18, x37);      Mul(x18, x37, x37);
+            uint[] x74 = Create();   Sqr(x37, 37, x74);      Mul(x37, x74, x74);
+            uint[] x111 = Create();  Sqr(x74, 37, x111);     Mul(x37, x111, x111);
+            uint[] x222 = Create();  Sqr(x111, 111, x222);   Mul(x111, x222, x222);
+            uint[] x223 = Create();  Sqr(x222, x223);        Mul(x, x223, x223);
+
+            uint[] t = Create();
+            Sqr(x223, 223, t);
+            Mul(t, x222, t);
+            Sqr(t, 2, t);
+            Mul(t, x, z);
+        }
+
+        public static void Mul(uint[] x, uint y, uint[] z)
+        {
+            uint x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7];
+            uint x8 = x[8], x9 = x[9], x10 = x[10], x11 = x[11], x12 = x[12], x13 = x[13], x14 = x[14], x15 = x[15];
+
+            uint z1, z5, z9, z13;
+            ulong c, d, e, f;
+
+            c     = (ulong)x1 * y;
+            z1    = (uint)c & M28; c >>= 28;
+            d     = (ulong)x5 * y;
+            z5    = (uint)d & M28; d >>= 28;
+            e     = (ulong)x9 * y;
+            z9    = (uint)e & M28; e >>= 28;
+            f     = (ulong)x13 * y;
+            z13   = (uint)f & M28; f >>= 28;
+
+            c    += (ulong)x2 * y;
+            z[2]  = (uint)c & M28; c >>= 28;
+            d    += (ulong)x6 * y;
+            z[6]  = (uint)d & M28; d >>= 28;
+            e    += (ulong)x10 * y;
+            z[10] = (uint)e & M28; e >>= 28;
+            f    += (ulong)x14 * y;
+            z[14] = (uint)f & M28; f >>= 28;
+
+            c    += (ulong)x3 * y;
+            z[3]  = (uint)c & M28; c >>= 28;
+            d    += (ulong)x7 * y;
+            z[7]  = (uint)d & M28; d >>= 28;
+            e    += (ulong)x11 * y;
+            z[11] = (uint)e & M28; e >>= 28;
+            f    += (ulong)x15 * y;
+            z[15] = (uint)f & M28; f >>= 28;
+
+            d    += f;
+
+            c    += (ulong)x4 * y;
+            z[4]  = (uint)c & M28; c >>= 28;
+            d    += (ulong)x8 * y;
+            z[8]  = (uint)d & M28; d >>= 28;
+            e    += (ulong)x12 * y;
+            z[12] = (uint)e & M28; e >>= 28;
+            f    += (ulong)x0 * y;
+            z[0]  = (uint)f & M28; f >>= 28;
+
+            z[1]  = z1 + (uint)f;
+            z[5]  = z5 + (uint)c;
+            z[9]  = z9 + (uint)d;
+            z[13] = z13 + (uint)e;
+        }
+
+        public static void Mul(uint[] x, uint[] y, uint[] z)
+        {
+            uint x0 = x[0];
+            uint x1 = x[1];
+            uint x2 = x[2];
+            uint x3 = x[3];
+            uint x4 = x[4];
+            uint x5 = x[5];
+            uint x6 = x[6];
+            uint x7 = x[7];
+
+            uint u0 = x[8];
+            uint u1 = x[9];
+            uint u2 = x[10];
+            uint u3 = x[11];
+            uint u4 = x[12];
+            uint u5 = x[13];
+            uint u6 = x[14];
+            uint u7 = x[15];
+
+            uint y0 = y[0];
+            uint y1 = y[1];
+            uint y2 = y[2];
+            uint y3 = y[3];
+            uint y4 = y[4];
+            uint y5 = y[5];
+            uint y6 = y[6];
+            uint y7 = y[7];
+
+            uint v0 = y[8];
+            uint v1 = y[9];
+            uint v2 = y[10];
+            uint v3 = y[11];
+            uint v4 = y[12];
+            uint v5 = y[13];
+            uint v6 = y[14];
+            uint v7 = y[15];
+
+            uint s0 = x0 + u0;
+            uint s1 = x1 + u1;
+            uint s2 = x2 + u2;
+            uint s3 = x3 + u3;
+            uint s4 = x4 + u4;
+            uint s5 = x5 + u5;
+            uint s6 = x6 + u6;
+            uint s7 = x7 + u7;
+
+            uint t0 = y0 + v0;
+            uint t1 = y1 + v1;
+            uint t2 = y2 + v2;
+            uint t3 = y3 + v3;
+            uint t4 = y4 + v4;
+            uint t5 = y5 + v5;
+            uint t6 = y6 + v6;
+            uint t7 = y7 + v7;
+
+            uint z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14, z15;
+            ulong c, d;
+
+            ulong f0  = (ulong)x0 * y0;
+            ulong f8  = (ulong)x7 * y1
+                      + (ulong)x6 * y2
+                      + (ulong)x5 * y3
+                      + (ulong)x4 * y4
+                      + (ulong)x3 * y5
+                      + (ulong)x2 * y6
+                      + (ulong)x1 * y7;
+            ulong g0  = (ulong)u0 * v0;
+            ulong g8  = (ulong)u7 * v1
+                      + (ulong)u6 * v2
+                      + (ulong)u5 * v3
+                      + (ulong)u4 * v4
+                      + (ulong)u3 * v5
+                      + (ulong)u2 * v6
+                      + (ulong)u1 * v7;
+            ulong h0  = (ulong)s0 * t0;
+            ulong h8  = (ulong)s7 * t1
+                      + (ulong)s6 * t2
+                      + (ulong)s5 * t3
+                      + (ulong)s4 * t4
+                      + (ulong)s3 * t5
+                      + (ulong)s2 * t6
+                      + (ulong)s1 * t7;
+
+            c         = f0 + g0 + h8 - f8;
+            z0        = (uint)c & M28; c >>= 28;
+            d         = g8 + h0 - f0 + h8;
+            z8        = (uint)d & M28; d >>= 28;
+
+            ulong f1  = (ulong)x1 * y0
+                      + (ulong)x0 * y1;
+            ulong f9  = (ulong)x7 * y2
+                      + (ulong)x6 * y3
+                      + (ulong)x5 * y4
+                      + (ulong)x4 * y5
+                      + (ulong)x3 * y6
+                      + (ulong)x2 * y7;
+            ulong g1  = (ulong)u1 * v0
+                      + (ulong)u0 * v1;
+            ulong g9  = (ulong)u7 * v2
+                      + (ulong)u6 * v3
+                      + (ulong)u5 * v4
+                      + (ulong)u4 * v5
+                      + (ulong)u3 * v6
+                      + (ulong)u2 * v7;
+            ulong h1  = (ulong)s1 * t0
+                      + (ulong)s0 * t1;
+            ulong h9  = (ulong)s7 * t2
+                      + (ulong)s6 * t3
+                      + (ulong)s5 * t4
+                      + (ulong)s4 * t5
+                      + (ulong)s3 * t6
+                      + (ulong)s2 * t7;
+
+            c        += f1 + g1 + h9 - f9;
+            z1        = (uint)c & M28; c >>= 28;
+            d        += g9 + h1 - f1 + h9;
+            z9        = (uint)d & M28; d >>= 28;
+
+            ulong f2  = (ulong)x2 * y0
+                      + (ulong)x1 * y1
+                      + (ulong)x0 * y2;
+            ulong f10 = (ulong)x7 * y3
+                      + (ulong)x6 * y4
+                      + (ulong)x5 * y5
+                      + (ulong)x4 * y6
+                      + (ulong)x3 * y7;
+            ulong g2  = (ulong)u2 * v0
+                      + (ulong)u1 * v1
+                      + (ulong)u0 * v2;
+            ulong g10 = (ulong)u7 * v3
+                      + (ulong)u6 * v4
+                      + (ulong)u5 * v5
+                      + (ulong)u4 * v6
+                      + (ulong)u3 * v7;
+            ulong h2  = (ulong)s2 * t0
+                      + (ulong)s1 * t1
+                      + (ulong)s0 * t2;
+            ulong h10 = (ulong)s7 * t3
+                      + (ulong)s6 * t4
+                      + (ulong)s5 * t5
+                      + (ulong)s4 * t6
+                      + (ulong)s3 * t7;
+
+            c        += f2 + g2 + h10 - f10;
+            z2        = (uint)c & M28; c >>= 28;
+            d        += g10 + h2 - f2 + h10;
+            z10       = (uint)d & M28; d >>= 28;
+
+            ulong f3  = (ulong)x3 * y0
+                      + (ulong)x2 * y1
+                      + (ulong)x1 * y2
+                      + (ulong)x0 * y3;
+            ulong f11 = (ulong)x7 * y4
+                      + (ulong)x6 * y5
+                      + (ulong)x5 * y6
+                      + (ulong)x4 * y7;
+            ulong g3  = (ulong)u3 * v0
+                      + (ulong)u2 * v1
+                      + (ulong)u1 * v2
+                      + (ulong)u0 * v3;
+            ulong g11 = (ulong)u7 * v4
+                      + (ulong)u6 * v5
+                      + (ulong)u5 * v6
+                      + (ulong)u4 * v7;
+            ulong h3  = (ulong)s3 * t0
+                      + (ulong)s2 * t1
+                      + (ulong)s1 * t2
+                      + (ulong)s0 * t3;
+            ulong h11 = (ulong)s7 * t4
+                      + (ulong)s6 * t5
+                      + (ulong)s5 * t6
+                      + (ulong)s4 * t7;
+
+            c        += f3 + g3 + h11 - f11;
+            z3        = (uint)c & M28; c >>= 28;
+            d        += g11 + h3 - f3 + h11;
+            z11       = (uint)d & M28; d >>= 28;
+
+            ulong f4  = (ulong)x4 * y0
+                      + (ulong)x3 * y1
+                      + (ulong)x2 * y2
+                      + (ulong)x1 * y3
+                      + (ulong)x0 * y4;
+            ulong f12 = (ulong)x7 * y5
+                      + (ulong)x6 * y6
+                      + (ulong)x5 * y7;
+            ulong g4  = (ulong)u4 * v0
+                      + (ulong)u3 * v1
+                      + (ulong)u2 * v2
+                      + (ulong)u1 * v3
+                      + (ulong)u0 * v4;
+            ulong g12 = (ulong)u7 * v5
+                      + (ulong)u6 * v6
+                      + (ulong)u5 * v7;
+            ulong h4  = (ulong)s4 * t0
+                      + (ulong)s3 * t1
+                      + (ulong)s2 * t2
+                      + (ulong)s1 * t3
+                      + (ulong)s0 * t4;
+            ulong h12 = (ulong)s7 * t5
+                      + (ulong)s6 * t6
+                      + (ulong)s5 * t7;
+
+            c        += f4 + g4 + h12 - f12;
+            z4        = (uint)c & M28; c >>= 28;
+            d        += g12 + h4 - f4 + h12;
+            z12       = (uint)d & M28; d >>= 28;
+
+            ulong f5  = (ulong)x5 * y0
+                      + (ulong)x4 * y1
+                      + (ulong)x3 * y2
+                      + (ulong)x2 * y3
+                      + (ulong)x1 * y4
+                      + (ulong)x0 * y5;
+            ulong f13 = (ulong)x7 * y6
+                      + (ulong)x6 * y7;
+            ulong g5  = (ulong)u5 * v0
+                      + (ulong)u4 * v1
+                      + (ulong)u3 * v2
+                      + (ulong)u2 * v3
+                      + (ulong)u1 * v4
+                      + (ulong)u0 * v5;
+            ulong g13 = (ulong)u7 * v6
+                      + (ulong)u6 * v7;
+            ulong h5  = (ulong)s5 * t0
+                      + (ulong)s4 * t1
+                      + (ulong)s3 * t2
+                      + (ulong)s2 * t3
+                      + (ulong)s1 * t4
+                      + (ulong)s0 * t5;
+            ulong h13 = (ulong)s7 * t6
+                      + (ulong)s6 * t7;
+
+            c        += f5 + g5 + h13 - f13;
+            z5        = (uint)c & M28; c >>= 28;
+            d        += g13 + h5 - f5 + h13;
+            z13       = (uint)d & M28; d >>= 28;
+
+            ulong f6  = (ulong)x6 * y0
+                      + (ulong)x5 * y1
+                      + (ulong)x4 * y2
+                      + (ulong)x3 * y3
+                      + (ulong)x2 * y4
+                      + (ulong)x1 * y5
+                      + (ulong)x0 * y6;
+            ulong f14 = (ulong)x7 * y7;
+            ulong g6  = (ulong)u6 * v0
+                      + (ulong)u5 * v1
+                      + (ulong)u4 * v2
+                      + (ulong)u3 * v3
+                      + (ulong)u2 * v4
+                      + (ulong)u1 * v5
+                      + (ulong)u0 * v6;
+            ulong g14 = (ulong)u7 * v7;
+            ulong h6  = (ulong)s6 * t0
+                      + (ulong)s5 * t1
+                      + (ulong)s4 * t2
+                      + (ulong)s3 * t3
+                      + (ulong)s2 * t4
+                      + (ulong)s1 * t5
+                      + (ulong)s0 * t6;
+            ulong h14 = (ulong)s7 * t7;
+
+            c        += f6 + g6 + h14 - f14;
+            z6        = (uint)c & M28; c >>= 28;
+            d        += g14 + h6 - f6 + h14;
+            z14       = (uint)d & M28; d >>= 28;
+
+            ulong f7  = (ulong)x7 * y0
+                      + (ulong)x6 * y1
+                      + (ulong)x5 * y2
+                      + (ulong)x4 * y3
+                      + (ulong)x3 * y4
+                      + (ulong)x2 * y5
+                      + (ulong)x1 * y6
+                      + (ulong)x0 * y7;
+            ulong g7  = (ulong)u7 * v0
+                      + (ulong)u6 * v1
+                      + (ulong)u5 * v2
+                      + (ulong)u4 * v3
+                      + (ulong)u3 * v4
+                      + (ulong)u2 * v5
+                      + (ulong)u1 * v6
+                      + (ulong)u0 * v7;
+            ulong h7  = (ulong)s7 * t0
+                      + (ulong)s6 * t1
+                      + (ulong)s5 * t2
+                      + (ulong)s4 * t3
+                      + (ulong)s3 * t4
+                      + (ulong)s2 * t5
+                      + (ulong)s1 * t6
+                      + (ulong)s0 * t7;
+
+            c        += f7 + g7;
+            z7        = (uint)c & M28; c >>= 28;
+            d        += h7 - f7;
+            z15       = (uint)d & M28; d >>= 28;
+
+            c        += d;
+
+            c        += z8;
+            z8        = (uint)c & M28; c >>= 28;
+            d        += z0;
+            z0        = (uint)d & M28; d >>= 28;
+            z9       += (uint)c;
+            z1       += (uint)d;
+
+            z[0] = z0;
+            z[1] = z1;
+            z[2] = z2;
+            z[3] = z3;
+            z[4] = z4;
+            z[5] = z5;
+            z[6] = z6;
+            z[7] = z7;
+            z[8] = z8;
+            z[9] = z9;
+            z[10] = z10;
+            z[11] = z11;
+            z[12] = z12;
+            z[13] = z13;
+            z[14] = z14;
+            z[15] = z15;
+        }
+
+        public static void Normalize(uint[] z)
+        {
+            //int x = (z[15] >> (28 - 1)) & 1;
+            Reduce(z, 1);
+            Reduce(z, -1);
+            Debug.Assert(z[15] >> 28 == 0U);
+        }
+
+        private static void Reduce(uint[] z, int c)
+        {
+            uint z15 = z[15];
+            long t = z15;
+            z15 &= M28;
+            t = (t >> 28) + c;
+            z[8] += (uint)t;
+            for (int i = 0; i < 15; ++i)
+            {
+                t += z[i]; z[i] = (uint)t & M28; t >>= 28;
+            }
+            z[15] = z15 + (uint)t;
+        }
+
+        public static void Sqr(uint[] x, uint[] z)
+        {
+            uint x0 = x[0];
+            uint x1 = x[1];
+            uint x2 = x[2];
+            uint x3 = x[3];
+            uint x4 = x[4];
+            uint x5 = x[5];
+            uint x6 = x[6];
+            uint x7 = x[7];
+
+            uint u0 = x[8];
+            uint u1 = x[9];
+            uint u2 = x[10];
+            uint u3 = x[11];
+            uint u4 = x[12];
+            uint u5 = x[13];
+            uint u6 = x[14];
+            uint u7 = x[15];
+
+            uint x0_2 = x0 * 2;
+            uint x1_2 = x1 * 2;
+            uint x2_2 = x2 * 2;
+            uint x3_2 = x3 * 2;
+            uint x4_2 = x4 * 2;
+            uint x5_2 = x5 * 2;
+            uint x6_2 = x6 * 2;
+
+            uint u0_2 = u0 * 2;
+            uint u1_2 = u1 * 2;
+            uint u2_2 = u2 * 2;
+            uint u3_2 = u3 * 2;
+            uint u4_2 = u4 * 2;
+            uint u5_2 = u5 * 2;
+            uint u6_2 = u6 * 2;
+        
+            uint s0 = x0 + u0;
+            uint s1 = x1 + u1;
+            uint s2 = x2 + u2;
+            uint s3 = x3 + u3;
+            uint s4 = x4 + u4;
+            uint s5 = x5 + u5;
+            uint s6 = x6 + u6;
+            uint s7 = x7 + u7;
+
+            uint s0_2 = s0 * 2;
+            uint s1_2 = s1 * 2;
+            uint s2_2 = s2 * 2;
+            uint s3_2 = s3 * 2;
+            uint s4_2 = s4 * 2;
+            uint s5_2 = s5 * 2;
+            uint s6_2 = s6 * 2;
+
+            uint z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14, z15;
+            ulong c, d;
+
+            ulong f0  = (ulong)x0 * x0;
+            ulong f8  = (ulong)x7 * x1_2
+                      + (ulong)x6 * x2_2
+                      + (ulong)x5 * x3_2
+                      + (ulong)x4 * x4;
+            ulong g0  = (ulong)u0 * u0;
+            ulong g8  = (ulong)u7 * u1_2
+                      + (ulong)u6 * u2_2
+                      + (ulong)u5 * u3_2
+                      + (ulong)u4 * u4;
+            ulong h0  = (ulong)s0 * s0;
+            ulong h8  = (ulong)s7 * s1_2
+                      + (ulong)s6 * s2_2
+                      + (ulong)s5 * s3_2
+                      + (ulong)s4 * s4;
+
+            c         = f0 + g0 + h8 - f8;
+            z0        = (uint)c & M28; c >>= 28;
+            d         = g8 + h0 - f0 + h8;
+            z8        = (uint)d & M28; d >>= 28;
+
+            ulong f1  = (ulong)x1 * x0_2;
+            ulong f9  = (ulong)x7 * x2_2
+                      + (ulong)x6 * x3_2
+                      + (ulong)x5 * x4_2;
+            ulong g1  = (ulong)u1 * u0_2;
+            ulong g9  = (ulong)u7 * u2_2
+                      + (ulong)u6 * u3_2
+                      + (ulong)u5 * u4_2;
+            ulong h1  = (ulong)s1 * s0_2;
+            ulong h9  = (ulong)s7 * s2_2
+                      + (ulong)s6 * s3_2
+                      + (ulong)s5 * s4_2;
+
+            c        += f1 + g1 + h9 - f9;
+            z1        = (uint)c & M28; c >>= 28;
+            d        += g9 + h1 - f1 + h9;
+            z9        = (uint)d & M28; d >>= 28;
+
+            ulong f2  = (ulong)x2 * x0_2
+                      + (ulong)x1 * x1;
+            ulong f10 = (ulong)x7 * x3_2
+                      + (ulong)x6 * x4_2
+                      + (ulong)x5 * x5;
+            ulong g2  = (ulong)u2 * u0_2
+                      + (ulong)u1 * u1;
+            ulong g10 = (ulong)u7 * u3_2
+                      + (ulong)u6 * u4_2
+                      + (ulong)u5 * u5;
+            ulong h2  = (ulong)s2 * s0_2
+                      + (ulong)s1 * s1;
+            ulong h10 = (ulong)s7 * s3_2
+                      + (ulong)s6 * s4_2
+                      + (ulong)s5 * s5;
+
+            c        += f2 + g2 + h10 - f10;
+            z2        = (uint)c & M28; c >>= 28;
+            d        += g10 + h2 - f2 + h10;
+            z10       = (uint)d & M28; d >>= 28;
+
+            ulong f3  = (ulong)x3 * x0_2
+                      + (ulong)x2 * x1_2;
+            ulong f11 = (ulong)x7 * x4_2
+                      + (ulong)x6 * x5_2;
+            ulong g3  = (ulong)u3 * u0_2
+                      + (ulong)u2 * u1_2;
+            ulong g11 = (ulong)u7 * u4_2
+                      + (ulong)u6 * u5_2;
+            ulong h3  = (ulong)s3 * s0_2
+                      + (ulong)s2 * s1_2;
+            ulong h11 = (ulong)s7 * s4_2
+                      + (ulong)s6 * s5_2;
+
+            c        += f3 + g3 + h11 - f11;
+            z3        = (uint)c & M28; c >>= 28;
+            d        += g11 + h3 - f3 + h11;
+            z11       = (uint)d & M28; d >>= 28;
+
+            ulong f4  = (ulong)x4 * x0_2
+                      + (ulong)x3 * x1_2
+                      + (ulong)x2 * x2;
+            ulong f12 = (ulong)x7 * x5_2
+                      + (ulong)x6 * x6;
+            ulong g4  = (ulong)u4 * u0_2
+                      + (ulong)u3 * u1_2
+                      + (ulong)u2 * u2;
+            ulong g12 = (ulong)u7 * u5_2
+                      + (ulong)u6 * u6;
+            ulong h4  = (ulong)s4 * s0_2
+                      + (ulong)s3 * s1_2
+                      + (ulong)s2 * s2;
+            ulong h12 = (ulong)s7 * s5_2
+                      + (ulong)s6 * s6;
+
+            c        += f4 + g4 + h12 - f12;
+            z4        = (uint)c & M28; c >>= 28;
+            d        += g12 + h4 - f4 + h12;
+            z12       = (uint)d & M28; d >>= 28;
+
+            ulong f5  = (ulong)x5 * x0_2
+                      + (ulong)x4 * x1_2
+                      + (ulong)x3 * x2_2;
+            ulong f13 = (ulong)x7 * x6_2;
+            ulong g5  = (ulong)u5 * u0_2
+                      + (ulong)u4 * u1_2
+                      + (ulong)u3 * u2_2;
+            ulong g13 = (ulong)u7 * u6_2;
+            ulong h5  = (ulong)s5 * s0_2
+                      + (ulong)s4 * s1_2
+                      + (ulong)s3 * s2_2;
+            ulong h13 = (ulong)s7 * s6_2;
+
+            c        += f5 + g5 + h13 - f13;
+            z5        = (uint)c & M28; c >>= 28;
+            d        += g13 + h5 - f5 + h13;
+            z13       = (uint)d & M28; d >>= 28;
+
+            ulong f6  = (ulong)x6 * x0_2
+                      + (ulong)x5 * x1_2
+                      + (ulong)x4 * x2_2
+                      + (ulong)x3 * x3;
+            ulong f14 = (ulong)x7 * x7;
+            ulong g6  = (ulong)u6 * u0_2
+                      + (ulong)u5 * u1_2
+                      + (ulong)u4 * u2_2
+                      + (ulong)u3 * u3;
+            ulong g14 = (ulong)u7 * u7;
+            ulong h6  = (ulong)s6 * s0_2
+                      + (ulong)s5 * s1_2
+                      + (ulong)s4 * s2_2
+                      + (ulong)s3 * s3;
+            ulong h14 = (ulong)s7 * s7;
+
+            c        += f6 + g6 + h14 - f14;
+            z6        = (uint)c & M28; c >>= 28;
+            d        += g14 + h6 - f6 + h14;
+            z14       = (uint)d & M28; d >>= 28;
+
+            ulong f7  = (ulong)x7 * x0_2
+                      + (ulong)x6 * x1_2
+                      + (ulong)x5 * x2_2
+                      + (ulong)x4 * x3_2;
+            ulong g7  = (ulong)u7 * u0_2
+                      + (ulong)u6 * u1_2
+                      + (ulong)u5 * u2_2
+                      + (ulong)u4 * u3_2;
+            ulong h7  = (ulong)s7 * s0_2
+                      + (ulong)s6 * s1_2
+                      + (ulong)s5 * s2_2
+                      + (ulong)s4 * s3_2;
+
+            c        += f7 + g7;
+            z7        = (uint)c & M28; c >>= 28;
+            d        += h7 - f7;
+            z15       = (uint)d & M28; d >>= 28;
+
+            c        += d;
+
+            c        += z8;
+            z8        = (uint)c & M28; c >>= 28;
+            d        += z0;
+            z0        = (uint)d & M28; d >>= 28;
+            z9       += (uint)c;
+            z1       += (uint)d;
+
+            z[0] = z0;
+            z[1] = z1;
+            z[2] = z2;
+            z[3] = z3;
+            z[4] = z4;
+            z[5] = z5;
+            z[6] = z6;
+            z[7] = z7;
+            z[8] = z8;
+            z[9] = z9;
+            z[10] = z10;
+            z[11] = z11;
+            z[12] = z12;
+            z[13] = z13;
+            z[14] = z14;
+            z[15] = z15;
+        }
+
+        public static void Sqr(uint[] x, int n, uint[] z)
+        {
+            Debug.Assert(n > 0);
+
+            Sqr(x, z);
+
+            while (--n > 0)
+            {
+                Sqr(z, z);
+            }
+        }
+
+        public static void Sub(uint[] x, uint[] y, uint[] z)
+        {
+            uint x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7];
+            uint x8 = x[8], x9 = x[9], x10 = x[10], x11 = x[11], x12 = x[12], x13 = x[13], x14 = x[14], x15 = x[15];
+            uint y0 = y[0], y1 = y[1], y2 = y[2], y3 = y[3], y4 = y[4], y5 = y[5], y6 = y[6], y7 = y[7];
+            uint y8 = y[8], y9 = y[9], y10 = y[10], y11 = y[11], y12 = y[12], y13 = y[13], y14 = y[14], y15 = y[15];
+
+            uint z0 = x0 + 0x1FFFFFFEU - y0;
+            uint z1 = x1 + 0x1FFFFFFEU - y1;
+            uint z2 = x2 + 0x1FFFFFFEU - y2;
+            uint z3 = x3 + 0x1FFFFFFEU - y3;
+            uint z4 = x4 + 0x1FFFFFFEU - y4;
+            uint z5 = x5 + 0x1FFFFFFEU - y5;
+            uint z6 = x6 + 0x1FFFFFFEU - y6;
+            uint z7 = x7 + 0x1FFFFFFEU - y7;
+            uint z8 = x8 + 0x1FFFFFFCU - y8;
+            uint z9 = x9 + 0x1FFFFFFEU - y9;
+            uint z10 = x10 + 0x1FFFFFFEU - y10;
+            uint z11 = x11 + 0x1FFFFFFEU - y11;
+            uint z12 = x12 + 0x1FFFFFFEU - y12;
+            uint z13 = x13 + 0x1FFFFFFEU - y13;
+            uint z14 = x14 + 0x1FFFFFFEU - y14;
+            uint z15 = x15 + 0x1FFFFFFEU - y15;
+
+            z2   += z1 >> 28; z1 &= M28;
+            z6   += z5 >> 28; z5 &= M28;
+            z10  += z9 >> 28; z9 &= M28;
+            z14  += z13 >> 28; z13 &= M28;
+
+            z3   += z2 >> 28; z2 &= M28;
+            z7   += z6 >> 28; z6 &= M28;
+            z11  += z10 >> 28; z10 &= M28;
+            z15  += z14 >> 28; z14 &= M28;
+
+            uint t = z15 >> 28; z15 &= M28;
+            z0   += t;
+            z8   += t;
+
+            z4   += z3 >> 28; z3 &= M28;
+            z8   += z7 >> 28; z7 &= M28;
+            z12  += z11 >> 28; z11 &= M28;
+
+            z1   += z0 >> 28; z0 &= M28;
+            z5   += z4 >> 28; z4 &= M28;
+            z9   += z8 >> 28; z8 &= M28;
+            z13  += z12 >> 28; z12 &= M28;
+
+            z[0] = z0;
+            z[1] = z1;
+            z[2] = z2;
+            z[3] = z3;
+            z[4] = z4;
+            z[5] = z5;
+            z[6] = z6;
+            z[7] = z7;
+            z[8] = z8;
+            z[9] = z9;
+            z[10] = z10;
+            z[11] = z11;
+            z[12] = z12;
+            z[13] = z13;
+            z[14] = z14;
+            z[15] = z15;
+        }
+    }
+}