summary refs log tree commit diff
path: root/crypto/src/math/ec/rfc7748/X448Field.cs
diff options
context:
space:
mode:
Diffstat (limited to 'crypto/src/math/ec/rfc7748/X448Field.cs')
-rw-r--r--crypto/src/math/ec/rfc7748/X448Field.cs904
1 files changed, 904 insertions, 0 deletions
diff --git a/crypto/src/math/ec/rfc7748/X448Field.cs b/crypto/src/math/ec/rfc7748/X448Field.cs
new file mode 100644
index 000000000..0c44f1eb5
--- /dev/null
+++ b/crypto/src/math/ec/rfc7748/X448Field.cs
@@ -0,0 +1,904 @@
+using System;
+using System.Diagnostics;
+
+namespace Org.BouncyCastle.Math.EC.Rfc7748
+{
+    [CLSCompliantAttribute(false)]
+    public abstract class X448Field
+    {
+        public const int Size = 16;
+
+        private const uint M28 = 0x0FFFFFFFU;
+
+        private X448Field() {}
+
+        public static void Add(uint[] x, uint[] y, uint[] z)
+        {
+            for (int i = 0; i < Size; ++i)
+            {
+                z[i] = x[i] + y[i];
+            }
+        }
+
+        //public static void Apm(int[] x, int[] y, int[] zp, int[] zm)
+        //{
+        //    for (int i = 0; i < Size; ++i)
+        //    {
+        //        int xi = x[i], yi = y[i];
+        //        zp[i] = xi + yi;
+        //        zm[i] = xi - yi;
+        //    }
+        //}
+
+        public static void Carry(uint[] z)
+        {
+            uint z0 = z[0], z1 = z[1], z2 = z[2], z3 = z[3], z4 = z[4], z5 = z[5], z6 = z[6], z7 = z[7];
+            uint z8 = z[8], z9 = z[9], z10 = z[10], z11 = z[11], z12 = z[12], z13 = z[13], z14 = z[14], z15 = z[15];
+
+            z2   += (z1 >> 28); z1 &= M28;
+            z6   += (z5 >> 28); z5 &= M28;
+            z10  += (z9 >> 28); z9 &= M28;
+            z14  += (z13 >> 28); z13 &= M28;
+
+            z3   += (z2 >> 28); z2 &= M28;
+            z7   += (z6 >> 28); z6 &= M28;
+            z11  += (z10 >> 28); z10 &= M28;
+            z15  += (z14 >> 28); z14 &= M28;
+
+            uint t = z15 >> 28; z15 &= M28;
+            z0   += t;
+            z8   += t;
+
+            z4   += (z3 >> 28); z3 &= M28;
+            z8   += (z7 >> 28); z7 &= M28;
+            z12  += (z11 >> 28); z11 &= M28;
+
+            z1   += (z0 >> 28); z0 &= M28;
+            z5   += (z4 >> 28); z4 &= M28;
+            z9   += (z8 >> 28); z8 &= M28;
+            z13  += (z12 >> 28); z12 &= M28;
+
+            z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4; z[5] = z5; z[6] = z6; z[7] = z7;
+            z[8] = z8; z[9] = z9; z[10] = z10; z[11] = z11; z[12] = z12; z[13] = z13; z[14] = z14; z[15] = z15;
+        }
+
+        public static void Copy(uint[] x, int xOff, uint[] z, int zOff)
+        {
+            for (int i = 0; i < Size; ++i)
+            {
+                z[zOff + i] = x[xOff + i];
+            }
+        }
+
+        public static uint[] Create()
+        {
+            return new uint[Size];
+        }
+
+        public static void CSwap(int swap, uint[] a, uint[] b)
+        {
+            Debug.Assert(swap >> 1 == 0);
+            Debug.Assert(a != b);
+
+            uint mask = (uint)(0 - swap);
+            for (int i = 0; i < Size; ++i)
+            {
+                uint ai = a[i], bi = b[i];
+                uint dummy = mask & (ai ^ bi);
+                a[i] = ai ^ dummy; 
+                b[i] = bi ^ dummy; 
+            }
+        }
+
+        public static void Decode(byte[] x, int xOff, uint[] z)
+        {
+            Decode56(x, xOff, z, 0);
+            Decode56(x, xOff + 7, z, 2);
+            Decode56(x, xOff + 14, z, 4);
+            Decode56(x, xOff + 21, z, 6);
+            Decode56(x, xOff + 28, z, 8);
+            Decode56(x, xOff + 35, z, 10);
+            Decode56(x, xOff + 42, z, 12);
+            Decode56(x, xOff + 49, z, 14);
+        }
+
+        private static uint Decode24(byte[] bs, int off)
+        {
+            uint n = bs[off];
+            n |= (uint)bs[++off] << 8;
+            n |= (uint)bs[++off] << 16;
+            return n;
+        }
+
+        private static uint Decode32(byte[] bs, int off)
+        {
+            uint n = bs[off];
+            n |= (uint)bs[++off] << 8;
+            n |= (uint)bs[++off] << 16;
+            n |= (uint)bs[++off] << 24;
+            return n;
+        }
+
+        private static void Decode56(byte[] bs, int off, uint[] z, int zOff)
+        {
+            uint lo = Decode32(bs, off);
+            uint hi = Decode24(bs, off + 4);
+            z[zOff] = lo & M28;
+            z[zOff + 1] = (lo >> 28) | (hi << 4);
+        }
+
+        public static void Encode(uint[] x, byte[] z, int zOff)
+        {
+            Encode56(x, 0, z, zOff);
+            Encode56(x, 2, z, zOff + 7);
+            Encode56(x, 4, z, zOff + 14);
+            Encode56(x, 6, z, zOff + 21);
+            Encode56(x, 8, z, zOff + 28);
+            Encode56(x, 10, z, zOff + 35);
+            Encode56(x, 12, z, zOff + 42);
+            Encode56(x, 14, z, zOff + 49);
+        }
+
+        private static void Encode24(uint n, byte[] bs, int off)
+        {
+            bs[  off] = (byte)(n      );
+            bs[++off] = (byte)(n >>  8);
+            bs[++off] = (byte)(n >> 16);
+        }
+
+        private static void Encode32(uint n, byte[] bs, int off)
+        {
+            bs[  off] = (byte)(n      );
+            bs[++off] = (byte)(n >>  8);
+            bs[++off] = (byte)(n >> 16);
+            bs[++off] = (byte)(n >> 24);
+        }
+
+        private static void Encode56(uint[] x, int xOff, byte[] bs, int off)
+        {
+            uint lo = x[xOff], hi = x[xOff + 1];
+            Encode32(lo | (hi << 28), bs, off);
+            Encode24(hi >> 4, bs, off + 4);
+        }
+
+        public static void Inv(uint[] x, uint[] z)
+        {
+            // z = x^(p-2) = x^(2^448 - 2^224 - 3)
+            // (223 1s) (1 0s) (222 1s) (1 0s) (1 1s)
+            // Addition chain: [1] 2 3 6 9 18 19 37 74 111 [222] [223]
+            uint[] x2 = Create();    Sqr(x, x2);             Mul(x, x2, x2);
+            uint[] x3 = Create();    Sqr(x2, x3);            Mul(x, x3, x3);
+            uint[] x6 = Create();    Sqr(x3, 3, x6);         Mul(x3, x6, x6);
+            uint[] x9 = Create();    Sqr(x6, 3, x9);         Mul(x3, x9, x9);
+            uint[] x18 = Create();   Sqr(x9, 9, x18);        Mul(x9, x18, x18);
+            uint[] x19 = Create();   Sqr(x18, x19);          Mul(x, x19, x19);
+            uint[] x37 = Create();   Sqr(x19, 18, x37);      Mul(x18, x37, x37);
+            uint[] x74 = Create();   Sqr(x37, 37, x74);      Mul(x37, x74, x74);
+            uint[] x111 = Create();  Sqr(x74, 37, x111);     Mul(x37, x111, x111);
+            uint[] x222 = Create();  Sqr(x111, 111, x222);   Mul(x111, x222, x222);
+            uint[] x223 = Create();  Sqr(x222, x223);        Mul(x, x223, x223);
+
+            uint[] t = Create();
+            Sqr(x223, 223, t);
+            Mul(t, x222, t);
+            Sqr(t, 2, t);
+            Mul(t, x, z);
+        }
+
+        public static void Mul(uint[] x, uint y, uint[] z)
+        {
+            uint x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7];
+            uint x8 = x[8], x9 = x[9], x10 = x[10], x11 = x[11], x12 = x[12], x13 = x[13], x14 = x[14], x15 = x[15];
+
+            uint z1, z5, z9, z13;
+            ulong c, d, e, f;
+
+            c     = (ulong)x1 * y;
+            z1    = (uint)c & M28; c >>= 28;
+            d     = (ulong)x5 * y;
+            z5    = (uint)d & M28; d >>= 28;
+            e     = (ulong)x9 * y;
+            z9    = (uint)e & M28; e >>= 28;
+            f     = (ulong)x13 * y;
+            z13   = (uint)f & M28; f >>= 28;
+
+            c    += (ulong)x2 * y;
+            z[2]  = (uint)c & M28; c >>= 28;
+            d    += (ulong)x6 * y;
+            z[6]  = (uint)d & M28; d >>= 28;
+            e    += (ulong)x10 * y;
+            z[10] = (uint)e & M28; e >>= 28;
+            f    += (ulong)x14 * y;
+            z[14] = (uint)f & M28; f >>= 28;
+
+            c    += (ulong)x3 * y;
+            z[3]  = (uint)c & M28; c >>= 28;
+            d    += (ulong)x7 * y;
+            z[7]  = (uint)d & M28; d >>= 28;
+            e    += (ulong)x11 * y;
+            z[11] = (uint)e & M28; e >>= 28;
+            f    += (ulong)x15 * y;
+            z[15] = (uint)f & M28; f >>= 28;
+
+            d    += f;
+
+            c    += (ulong)x4 * y;
+            z[4]  = (uint)c & M28; c >>= 28;
+            d    += (ulong)x8 * y;
+            z[8]  = (uint)d & M28; d >>= 28;
+            e    += (ulong)x12 * y;
+            z[12] = (uint)e & M28; e >>= 28;
+            f    += (ulong)x0 * y;
+            z[0]  = (uint)f & M28; f >>= 28;
+
+            z[1]  = z1 + (uint)f;
+            z[5]  = z5 + (uint)c;
+            z[9]  = z9 + (uint)d;
+            z[13] = z13 + (uint)e;
+        }
+
+        public static void Mul(uint[] x, uint[] y, uint[] z)
+        {
+            uint x0 = x[0];
+            uint x1 = x[1];
+            uint x2 = x[2];
+            uint x3 = x[3];
+            uint x4 = x[4];
+            uint x5 = x[5];
+            uint x6 = x[6];
+            uint x7 = x[7];
+
+            uint u0 = x[8];
+            uint u1 = x[9];
+            uint u2 = x[10];
+            uint u3 = x[11];
+            uint u4 = x[12];
+            uint u5 = x[13];
+            uint u6 = x[14];
+            uint u7 = x[15];
+
+            uint y0 = y[0];
+            uint y1 = y[1];
+            uint y2 = y[2];
+            uint y3 = y[3];
+            uint y4 = y[4];
+            uint y5 = y[5];
+            uint y6 = y[6];
+            uint y7 = y[7];
+
+            uint v0 = y[8];
+            uint v1 = y[9];
+            uint v2 = y[10];
+            uint v3 = y[11];
+            uint v4 = y[12];
+            uint v5 = y[13];
+            uint v6 = y[14];
+            uint v7 = y[15];
+
+            uint s0 = x0 + u0;
+            uint s1 = x1 + u1;
+            uint s2 = x2 + u2;
+            uint s3 = x3 + u3;
+            uint s4 = x4 + u4;
+            uint s5 = x5 + u5;
+            uint s6 = x6 + u6;
+            uint s7 = x7 + u7;
+
+            uint t0 = y0 + v0;
+            uint t1 = y1 + v1;
+            uint t2 = y2 + v2;
+            uint t3 = y3 + v3;
+            uint t4 = y4 + v4;
+            uint t5 = y5 + v5;
+            uint t6 = y6 + v6;
+            uint t7 = y7 + v7;
+
+            uint z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14, z15;
+            ulong c, d;
+
+            ulong f0  = (ulong)x0 * y0;
+            ulong f8  = (ulong)x7 * y1
+                      + (ulong)x6 * y2
+                      + (ulong)x5 * y3
+                      + (ulong)x4 * y4
+                      + (ulong)x3 * y5
+                      + (ulong)x2 * y6
+                      + (ulong)x1 * y7;
+            ulong g0  = (ulong)u0 * v0;
+            ulong g8  = (ulong)u7 * v1
+                      + (ulong)u6 * v2
+                      + (ulong)u5 * v3
+                      + (ulong)u4 * v4
+                      + (ulong)u3 * v5
+                      + (ulong)u2 * v6
+                      + (ulong)u1 * v7;
+            ulong h0  = (ulong)s0 * t0;
+            ulong h8  = (ulong)s7 * t1
+                      + (ulong)s6 * t2
+                      + (ulong)s5 * t3
+                      + (ulong)s4 * t4
+                      + (ulong)s3 * t5
+                      + (ulong)s2 * t6
+                      + (ulong)s1 * t7;
+
+            c         = f0 + g0 + h8 - f8;
+            z0        = (uint)c & M28; c >>= 28;
+            d         = g8 + h0 - f0 + h8;
+            z8        = (uint)d & M28; d >>= 28;
+
+            ulong f1  = (ulong)x1 * y0
+                      + (ulong)x0 * y1;
+            ulong f9  = (ulong)x7 * y2
+                      + (ulong)x6 * y3
+                      + (ulong)x5 * y4
+                      + (ulong)x4 * y5
+                      + (ulong)x3 * y6
+                      + (ulong)x2 * y7;
+            ulong g1  = (ulong)u1 * v0
+                      + (ulong)u0 * v1;
+            ulong g9  = (ulong)u7 * v2
+                      + (ulong)u6 * v3
+                      + (ulong)u5 * v4
+                      + (ulong)u4 * v5
+                      + (ulong)u3 * v6
+                      + (ulong)u2 * v7;
+            ulong h1  = (ulong)s1 * t0
+                      + (ulong)s0 * t1;
+            ulong h9  = (ulong)s7 * t2
+                      + (ulong)s6 * t3
+                      + (ulong)s5 * t4
+                      + (ulong)s4 * t5
+                      + (ulong)s3 * t6
+                      + (ulong)s2 * t7;
+
+            c        += f1 + g1 + h9 - f9;
+            z1        = (uint)c & M28; c >>= 28;
+            d        += g9 + h1 - f1 + h9;
+            z9        = (uint)d & M28; d >>= 28;
+
+            ulong f2  = (ulong)x2 * y0
+                      + (ulong)x1 * y1
+                      + (ulong)x0 * y2;
+            ulong f10 = (ulong)x7 * y3
+                      + (ulong)x6 * y4
+                      + (ulong)x5 * y5
+                      + (ulong)x4 * y6
+                      + (ulong)x3 * y7;
+            ulong g2  = (ulong)u2 * v0
+                      + (ulong)u1 * v1
+                      + (ulong)u0 * v2;
+            ulong g10 = (ulong)u7 * v3
+                      + (ulong)u6 * v4
+                      + (ulong)u5 * v5
+                      + (ulong)u4 * v6
+                      + (ulong)u3 * v7;
+            ulong h2  = (ulong)s2 * t0
+                      + (ulong)s1 * t1
+                      + (ulong)s0 * t2;
+            ulong h10 = (ulong)s7 * t3
+                      + (ulong)s6 * t4
+                      + (ulong)s5 * t5
+                      + (ulong)s4 * t6
+                      + (ulong)s3 * t7;
+
+            c        += f2 + g2 + h10 - f10;
+            z2        = (uint)c & M28; c >>= 28;
+            d        += g10 + h2 - f2 + h10;
+            z10       = (uint)d & M28; d >>= 28;
+
+            ulong f3  = (ulong)x3 * y0
+                      + (ulong)x2 * y1
+                      + (ulong)x1 * y2
+                      + (ulong)x0 * y3;
+            ulong f11 = (ulong)x7 * y4
+                      + (ulong)x6 * y5
+                      + (ulong)x5 * y6
+                      + (ulong)x4 * y7;
+            ulong g3  = (ulong)u3 * v0
+                      + (ulong)u2 * v1
+                      + (ulong)u1 * v2
+                      + (ulong)u0 * v3;
+            ulong g11 = (ulong)u7 * v4
+                      + (ulong)u6 * v5
+                      + (ulong)u5 * v6
+                      + (ulong)u4 * v7;
+            ulong h3  = (ulong)s3 * t0
+                      + (ulong)s2 * t1
+                      + (ulong)s1 * t2
+                      + (ulong)s0 * t3;
+            ulong h11 = (ulong)s7 * t4
+                      + (ulong)s6 * t5
+                      + (ulong)s5 * t6
+                      + (ulong)s4 * t7;
+
+            c        += f3 + g3 + h11 - f11;
+            z3        = (uint)c & M28; c >>= 28;
+            d        += g11 + h3 - f3 + h11;
+            z11       = (uint)d & M28; d >>= 28;
+
+            ulong f4  = (ulong)x4 * y0
+                      + (ulong)x3 * y1
+                      + (ulong)x2 * y2
+                      + (ulong)x1 * y3
+                      + (ulong)x0 * y4;
+            ulong f12 = (ulong)x7 * y5
+                      + (ulong)x6 * y6
+                      + (ulong)x5 * y7;
+            ulong g4  = (ulong)u4 * v0
+                      + (ulong)u3 * v1
+                      + (ulong)u2 * v2
+                      + (ulong)u1 * v3
+                      + (ulong)u0 * v4;
+            ulong g12 = (ulong)u7 * v5
+                      + (ulong)u6 * v6
+                      + (ulong)u5 * v7;
+            ulong h4  = (ulong)s4 * t0
+                      + (ulong)s3 * t1
+                      + (ulong)s2 * t2
+                      + (ulong)s1 * t3
+                      + (ulong)s0 * t4;
+            ulong h12 = (ulong)s7 * t5
+                      + (ulong)s6 * t6
+                      + (ulong)s5 * t7;
+
+            c        += f4 + g4 + h12 - f12;
+            z4        = (uint)c & M28; c >>= 28;
+            d        += g12 + h4 - f4 + h12;
+            z12       = (uint)d & M28; d >>= 28;
+
+            ulong f5  = (ulong)x5 * y0
+                      + (ulong)x4 * y1
+                      + (ulong)x3 * y2
+                      + (ulong)x2 * y3
+                      + (ulong)x1 * y4
+                      + (ulong)x0 * y5;
+            ulong f13 = (ulong)x7 * y6
+                      + (ulong)x6 * y7;
+            ulong g5  = (ulong)u5 * v0
+                      + (ulong)u4 * v1
+                      + (ulong)u3 * v2
+                      + (ulong)u2 * v3
+                      + (ulong)u1 * v4
+                      + (ulong)u0 * v5;
+            ulong g13 = (ulong)u7 * v6
+                      + (ulong)u6 * v7;
+            ulong h5  = (ulong)s5 * t0
+                      + (ulong)s4 * t1
+                      + (ulong)s3 * t2
+                      + (ulong)s2 * t3
+                      + (ulong)s1 * t4
+                      + (ulong)s0 * t5;
+            ulong h13 = (ulong)s7 * t6
+                      + (ulong)s6 * t7;
+
+            c        += f5 + g5 + h13 - f13;
+            z5        = (uint)c & M28; c >>= 28;
+            d        += g13 + h5 - f5 + h13;
+            z13       = (uint)d & M28; d >>= 28;
+
+            ulong f6  = (ulong)x6 * y0
+                      + (ulong)x5 * y1
+                      + (ulong)x4 * y2
+                      + (ulong)x3 * y3
+                      + (ulong)x2 * y4
+                      + (ulong)x1 * y5
+                      + (ulong)x0 * y6;
+            ulong f14 = (ulong)x7 * y7;
+            ulong g6  = (ulong)u6 * v0
+                      + (ulong)u5 * v1
+                      + (ulong)u4 * v2
+                      + (ulong)u3 * v3
+                      + (ulong)u2 * v4
+                      + (ulong)u1 * v5
+                      + (ulong)u0 * v6;
+            ulong g14 = (ulong)u7 * v7;
+            ulong h6  = (ulong)s6 * t0
+                      + (ulong)s5 * t1
+                      + (ulong)s4 * t2
+                      + (ulong)s3 * t3
+                      + (ulong)s2 * t4
+                      + (ulong)s1 * t5
+                      + (ulong)s0 * t6;
+            ulong h14 = (ulong)s7 * t7;
+
+            c        += f6 + g6 + h14 - f14;
+            z6        = (uint)c & M28; c >>= 28;
+            d        += g14 + h6 - f6 + h14;
+            z14       = (uint)d & M28; d >>= 28;
+
+            ulong f7  = (ulong)x7 * y0
+                      + (ulong)x6 * y1
+                      + (ulong)x5 * y2
+                      + (ulong)x4 * y3
+                      + (ulong)x3 * y4
+                      + (ulong)x2 * y5
+                      + (ulong)x1 * y6
+                      + (ulong)x0 * y7;
+            ulong g7  = (ulong)u7 * v0
+                      + (ulong)u6 * v1
+                      + (ulong)u5 * v2
+                      + (ulong)u4 * v3
+                      + (ulong)u3 * v4
+                      + (ulong)u2 * v5
+                      + (ulong)u1 * v6
+                      + (ulong)u0 * v7;
+            ulong h7  = (ulong)s7 * t0
+                      + (ulong)s6 * t1
+                      + (ulong)s5 * t2
+                      + (ulong)s4 * t3
+                      + (ulong)s3 * t4
+                      + (ulong)s2 * t5
+                      + (ulong)s1 * t6
+                      + (ulong)s0 * t7;
+
+            c        += f7 + g7;
+            z7        = (uint)c & M28; c >>= 28;
+            d        += h7 - f7;
+            z15       = (uint)d & M28; d >>= 28;
+
+            c        += d;
+
+            c        += z8;
+            z8        = (uint)c & M28; c >>= 28;
+            d        += z0;
+            z0        = (uint)d & M28; d >>= 28;
+            z9       += (uint)c;
+            z1       += (uint)d;
+
+            z[0] = z0;
+            z[1] = z1;
+            z[2] = z2;
+            z[3] = z3;
+            z[4] = z4;
+            z[5] = z5;
+            z[6] = z6;
+            z[7] = z7;
+            z[8] = z8;
+            z[9] = z9;
+            z[10] = z10;
+            z[11] = z11;
+            z[12] = z12;
+            z[13] = z13;
+            z[14] = z14;
+            z[15] = z15;
+        }
+
+        public static void Normalize(uint[] z)
+        {
+            //int x = (z[15] >> (28 - 1)) & 1;
+            Reduce(z, 1);
+            Reduce(z, -1);
+            Debug.Assert(z[15] >> 28 == 0U);
+        }
+
+        private static void Reduce(uint[] z, int c)
+        {
+            uint z15 = z[15];
+            long t = z15;
+            z15 &= M28;
+            t = (t >> 28) + c;
+            z[8] += (uint)t;
+            for (int i = 0; i < 15; ++i)
+            {
+                t += z[i]; z[i] = (uint)t & M28; t >>= 28;
+            }
+            z[15] = z15 + (uint)t;
+        }
+
+        public static void Sqr(uint[] x, uint[] z)
+        {
+            uint x0 = x[0];
+            uint x1 = x[1];
+            uint x2 = x[2];
+            uint x3 = x[3];
+            uint x4 = x[4];
+            uint x5 = x[5];
+            uint x6 = x[6];
+            uint x7 = x[7];
+
+            uint u0 = x[8];
+            uint u1 = x[9];
+            uint u2 = x[10];
+            uint u3 = x[11];
+            uint u4 = x[12];
+            uint u5 = x[13];
+            uint u6 = x[14];
+            uint u7 = x[15];
+
+            uint x0_2 = x0 * 2;
+            uint x1_2 = x1 * 2;
+            uint x2_2 = x2 * 2;
+            uint x3_2 = x3 * 2;
+            uint x4_2 = x4 * 2;
+            uint x5_2 = x5 * 2;
+            uint x6_2 = x6 * 2;
+
+            uint u0_2 = u0 * 2;
+            uint u1_2 = u1 * 2;
+            uint u2_2 = u2 * 2;
+            uint u3_2 = u3 * 2;
+            uint u4_2 = u4 * 2;
+            uint u5_2 = u5 * 2;
+            uint u6_2 = u6 * 2;
+        
+            uint s0 = x0 + u0;
+            uint s1 = x1 + u1;
+            uint s2 = x2 + u2;
+            uint s3 = x3 + u3;
+            uint s4 = x4 + u4;
+            uint s5 = x5 + u5;
+            uint s6 = x6 + u6;
+            uint s7 = x7 + u7;
+
+            uint s0_2 = s0 * 2;
+            uint s1_2 = s1 * 2;
+            uint s2_2 = s2 * 2;
+            uint s3_2 = s3 * 2;
+            uint s4_2 = s4 * 2;
+            uint s5_2 = s5 * 2;
+            uint s6_2 = s6 * 2;
+
+            uint z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14, z15;
+            ulong c, d;
+
+            ulong f0  = (ulong)x0 * x0;
+            ulong f8  = (ulong)x7 * x1_2
+                      + (ulong)x6 * x2_2
+                      + (ulong)x5 * x3_2
+                      + (ulong)x4 * x4;
+            ulong g0  = (ulong)u0 * u0;
+            ulong g8  = (ulong)u7 * u1_2
+                      + (ulong)u6 * u2_2
+                      + (ulong)u5 * u3_2
+                      + (ulong)u4 * u4;
+            ulong h0  = (ulong)s0 * s0;
+            ulong h8  = (ulong)s7 * s1_2
+                      + (ulong)s6 * s2_2
+                      + (ulong)s5 * s3_2
+                      + (ulong)s4 * s4;
+
+            c         = f0 + g0 + h8 - f8;
+            z0        = (uint)c & M28; c >>= 28;
+            d         = g8 + h0 - f0 + h8;
+            z8        = (uint)d & M28; d >>= 28;
+
+            ulong f1  = (ulong)x1 * x0_2;
+            ulong f9  = (ulong)x7 * x2_2
+                      + (ulong)x6 * x3_2
+                      + (ulong)x5 * x4_2;
+            ulong g1  = (ulong)u1 * u0_2;
+            ulong g9  = (ulong)u7 * u2_2
+                      + (ulong)u6 * u3_2
+                      + (ulong)u5 * u4_2;
+            ulong h1  = (ulong)s1 * s0_2;
+            ulong h9  = (ulong)s7 * s2_2
+                      + (ulong)s6 * s3_2
+                      + (ulong)s5 * s4_2;
+
+            c        += f1 + g1 + h9 - f9;
+            z1        = (uint)c & M28; c >>= 28;
+            d        += g9 + h1 - f1 + h9;
+            z9        = (uint)d & M28; d >>= 28;
+
+            ulong f2  = (ulong)x2 * x0_2
+                      + (ulong)x1 * x1;
+            ulong f10 = (ulong)x7 * x3_2
+                      + (ulong)x6 * x4_2
+                      + (ulong)x5 * x5;
+            ulong g2  = (ulong)u2 * u0_2
+                      + (ulong)u1 * u1;
+            ulong g10 = (ulong)u7 * u3_2
+                      + (ulong)u6 * u4_2
+                      + (ulong)u5 * u5;
+            ulong h2  = (ulong)s2 * s0_2
+                      + (ulong)s1 * s1;
+            ulong h10 = (ulong)s7 * s3_2
+                      + (ulong)s6 * s4_2
+                      + (ulong)s5 * s5;
+
+            c        += f2 + g2 + h10 - f10;
+            z2        = (uint)c & M28; c >>= 28;
+            d        += g10 + h2 - f2 + h10;
+            z10       = (uint)d & M28; d >>= 28;
+
+            ulong f3  = (ulong)x3 * x0_2
+                      + (ulong)x2 * x1_2;
+            ulong f11 = (ulong)x7 * x4_2
+                      + (ulong)x6 * x5_2;
+            ulong g3  = (ulong)u3 * u0_2
+                      + (ulong)u2 * u1_2;
+            ulong g11 = (ulong)u7 * u4_2
+                      + (ulong)u6 * u5_2;
+            ulong h3  = (ulong)s3 * s0_2
+                      + (ulong)s2 * s1_2;
+            ulong h11 = (ulong)s7 * s4_2
+                      + (ulong)s6 * s5_2;
+
+            c        += f3 + g3 + h11 - f11;
+            z3        = (uint)c & M28; c >>= 28;
+            d        += g11 + h3 - f3 + h11;
+            z11       = (uint)d & M28; d >>= 28;
+
+            ulong f4  = (ulong)x4 * x0_2
+                      + (ulong)x3 * x1_2
+                      + (ulong)x2 * x2;
+            ulong f12 = (ulong)x7 * x5_2
+                      + (ulong)x6 * x6;
+            ulong g4  = (ulong)u4 * u0_2
+                      + (ulong)u3 * u1_2
+                      + (ulong)u2 * u2;
+            ulong g12 = (ulong)u7 * u5_2
+                      + (ulong)u6 * u6;
+            ulong h4  = (ulong)s4 * s0_2
+                      + (ulong)s3 * s1_2
+                      + (ulong)s2 * s2;
+            ulong h12 = (ulong)s7 * s5_2
+                      + (ulong)s6 * s6;
+
+            c        += f4 + g4 + h12 - f12;
+            z4        = (uint)c & M28; c >>= 28;
+            d        += g12 + h4 - f4 + h12;
+            z12       = (uint)d & M28; d >>= 28;
+
+            ulong f5  = (ulong)x5 * x0_2
+                      + (ulong)x4 * x1_2
+                      + (ulong)x3 * x2_2;
+            ulong f13 = (ulong)x7 * x6_2;
+            ulong g5  = (ulong)u5 * u0_2
+                      + (ulong)u4 * u1_2
+                      + (ulong)u3 * u2_2;
+            ulong g13 = (ulong)u7 * u6_2;
+            ulong h5  = (ulong)s5 * s0_2
+                      + (ulong)s4 * s1_2
+                      + (ulong)s3 * s2_2;
+            ulong h13 = (ulong)s7 * s6_2;
+
+            c        += f5 + g5 + h13 - f13;
+            z5        = (uint)c & M28; c >>= 28;
+            d        += g13 + h5 - f5 + h13;
+            z13       = (uint)d & M28; d >>= 28;
+
+            ulong f6  = (ulong)x6 * x0_2
+                      + (ulong)x5 * x1_2
+                      + (ulong)x4 * x2_2
+                      + (ulong)x3 * x3;
+            ulong f14 = (ulong)x7 * x7;
+            ulong g6  = (ulong)u6 * u0_2
+                      + (ulong)u5 * u1_2
+                      + (ulong)u4 * u2_2
+                      + (ulong)u3 * u3;
+            ulong g14 = (ulong)u7 * u7;
+            ulong h6  = (ulong)s6 * s0_2
+                      + (ulong)s5 * s1_2
+                      + (ulong)s4 * s2_2
+                      + (ulong)s3 * s3;
+            ulong h14 = (ulong)s7 * s7;
+
+            c        += f6 + g6 + h14 - f14;
+            z6        = (uint)c & M28; c >>= 28;
+            d        += g14 + h6 - f6 + h14;
+            z14       = (uint)d & M28; d >>= 28;
+
+            ulong f7  = (ulong)x7 * x0_2
+                      + (ulong)x6 * x1_2
+                      + (ulong)x5 * x2_2
+                      + (ulong)x4 * x3_2;
+            ulong g7  = (ulong)u7 * u0_2
+                      + (ulong)u6 * u1_2
+                      + (ulong)u5 * u2_2
+                      + (ulong)u4 * u3_2;
+            ulong h7  = (ulong)s7 * s0_2
+                      + (ulong)s6 * s1_2
+                      + (ulong)s5 * s2_2
+                      + (ulong)s4 * s3_2;
+
+            c        += f7 + g7;
+            z7        = (uint)c & M28; c >>= 28;
+            d        += h7 - f7;
+            z15       = (uint)d & M28; d >>= 28;
+
+            c        += d;
+
+            c        += z8;
+            z8        = (uint)c & M28; c >>= 28;
+            d        += z0;
+            z0        = (uint)d & M28; d >>= 28;
+            z9       += (uint)c;
+            z1       += (uint)d;
+
+            z[0] = z0;
+            z[1] = z1;
+            z[2] = z2;
+            z[3] = z3;
+            z[4] = z4;
+            z[5] = z5;
+            z[6] = z6;
+            z[7] = z7;
+            z[8] = z8;
+            z[9] = z9;
+            z[10] = z10;
+            z[11] = z11;
+            z[12] = z12;
+            z[13] = z13;
+            z[14] = z14;
+            z[15] = z15;
+        }
+
+        public static void Sqr(uint[] x, int n, uint[] z)
+        {
+            Debug.Assert(n > 0);
+
+            Sqr(x, z);
+
+            while (--n > 0)
+            {
+                Sqr(z, z);
+            }
+        }
+
+        public static void Sub(uint[] x, uint[] y, uint[] z)
+        {
+            uint x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7];
+            uint x8 = x[8], x9 = x[9], x10 = x[10], x11 = x[11], x12 = x[12], x13 = x[13], x14 = x[14], x15 = x[15];
+            uint y0 = y[0], y1 = y[1], y2 = y[2], y3 = y[3], y4 = y[4], y5 = y[5], y6 = y[6], y7 = y[7];
+            uint y8 = y[8], y9 = y[9], y10 = y[10], y11 = y[11], y12 = y[12], y13 = y[13], y14 = y[14], y15 = y[15];
+
+            uint z0 = x0 + 0x1FFFFFFEU - y0;
+            uint z1 = x1 + 0x1FFFFFFEU - y1;
+            uint z2 = x2 + 0x1FFFFFFEU - y2;
+            uint z3 = x3 + 0x1FFFFFFEU - y3;
+            uint z4 = x4 + 0x1FFFFFFEU - y4;
+            uint z5 = x5 + 0x1FFFFFFEU - y5;
+            uint z6 = x6 + 0x1FFFFFFEU - y6;
+            uint z7 = x7 + 0x1FFFFFFEU - y7;
+            uint z8 = x8 + 0x1FFFFFFCU - y8;
+            uint z9 = x9 + 0x1FFFFFFEU - y9;
+            uint z10 = x10 + 0x1FFFFFFEU - y10;
+            uint z11 = x11 + 0x1FFFFFFEU - y11;
+            uint z12 = x12 + 0x1FFFFFFEU - y12;
+            uint z13 = x13 + 0x1FFFFFFEU - y13;
+            uint z14 = x14 + 0x1FFFFFFEU - y14;
+            uint z15 = x15 + 0x1FFFFFFEU - y15;
+
+            z2   += z1 >> 28; z1 &= M28;
+            z6   += z5 >> 28; z5 &= M28;
+            z10  += z9 >> 28; z9 &= M28;
+            z14  += z13 >> 28; z13 &= M28;
+
+            z3   += z2 >> 28; z2 &= M28;
+            z7   += z6 >> 28; z6 &= M28;
+            z11  += z10 >> 28; z10 &= M28;
+            z15  += z14 >> 28; z14 &= M28;
+
+            uint t = z15 >> 28; z15 &= M28;
+            z0   += t;
+            z8   += t;
+
+            z4   += z3 >> 28; z3 &= M28;
+            z8   += z7 >> 28; z7 &= M28;
+            z12  += z11 >> 28; z11 &= M28;
+
+            z1   += z0 >> 28; z0 &= M28;
+            z5   += z4 >> 28; z4 &= M28;
+            z9   += z8 >> 28; z8 &= M28;
+            z13  += z12 >> 28; z12 &= M28;
+
+            z[0] = z0;
+            z[1] = z1;
+            z[2] = z2;
+            z[3] = z3;
+            z[4] = z4;
+            z[5] = z5;
+            z[6] = z6;
+            z[7] = z7;
+            z[8] = z8;
+            z[9] = z9;
+            z[10] = z10;
+            z[11] = z11;
+            z[12] = z12;
+            z[13] = z13;
+            z[14] = z14;
+            z[15] = z15;
+        }
+    }
+}