summary refs log tree commit diff
diff options
context:
space:
mode:
author Peter Dettman <peter.dettman@bouncycastle.org> 2014-02-03 15:51:53 +0700
committer Peter Dettman <peter.dettman@bouncycastle.org> 2014-02-03 15:51:53 +0700
commit 75b6bf6278de7460c61cecaaa773542d92b3249e (patch)
tree 9736a9b240dfedcf3c3566b32883bc9c0ec4a2cc
parent Minor optimization for secp521r1 point doubling (diff)
download BouncyCastle.NET-ed25519-75b6bf6278de7460c61cecaaa773542d92b3249e.tar.xz
Implement Karatsuba multiply/square on 512-bit numbers and use as basis for P-521 multiply/square
-rw-r--r-- crypto/crypto.csproj 5
-rw-r--r-- crypto/src/math/ec/Nat.cs 97
-rw-r--r-- crypto/src/math/ec/custom/sec/Nat256.cs 456
-rw-r--r-- crypto/src/math/ec/custom/sec/Nat512.cs 46
-rw-r--r-- crypto/src/math/ec/custom/sec/SecP521R1Field.cs 41
5 files changed, 582 insertions, 63 deletions
diff --git a/crypto/crypto.csproj b/crypto/crypto.csproj
index 4c125b99a..da99b5e97 100644
--- a/crypto/crypto.csproj
+++ b/crypto/crypto.csproj
@@ -4689,6 +4689,11 @@
                     BuildAction = "Compile"
                 />
                 <File
+                    RelPath = "src\math\ec\custom\sec\Nat512.cs"
+                    SubType = "Code"
+                    BuildAction = "Compile"
+                />
+                <File
                     RelPath = "src\math\ec\custom\sec\SecP192K1Curve.cs"
                     SubType = "Code"
                     BuildAction = "Compile"
diff --git a/crypto/src/math/ec/Nat.cs b/crypto/src/math/ec/Nat.cs
index 819979473..821d5065d 100644
--- a/crypto/src/math/ec/Nat.cs
+++ b/crypto/src/math/ec/Nat.cs
@@ -2,7 +2,6 @@
 using System.Diagnostics;
 
 using Org.BouncyCastle.Crypto.Utilities;
-using Org.BouncyCastle.Math;
 
 namespace Org.BouncyCastle.Math.EC
 {
@@ -32,6 +31,18 @@ namespace Org.BouncyCastle.Math.EC
             return (uint)c;
         }
 
+        public static uint AddBothTo(int len, uint[] x, int xOff, uint[] y, int yOff, uint[] z, int zOff) // z[zOff..] += x[xOff..] + y[yOff..] over len 32-bit words; returns the carry out of the last word
+        {
+            ulong c = 0; // 64-bit accumulator: low 32 bits are the current word, the rest is the carry
+            for (int i = 0; i < len; ++i)
+            {
+                c += (ulong)x[xOff + i] + y[yOff + i] + z[zOff + i];
+                z[zOff + i] = (uint)c; // store the low word in place
+                c >>= 32; // keep only the carry for the next word (can exceed 1, three operands are summed)
+            }
+            return (uint)c;
+        }
+
         // TODO Re-write to allow full range for x?
         public static uint AddDWord(int len, ulong x, uint[] z, int zOff)
         {
@@ -46,14 +57,13 @@ namespace Org.BouncyCastle.Math.EC
             return c == 0 ? 0 : Inc(len, z, zOff + 2);
         }
 
-        public static uint AddToExt(int len, uint[] x, int xOff, uint[] zz, int zzOff)
+        public static uint AddTo(int len, uint[] x, int xOff, uint[] z, int zOff)
         {
-            Debug.Assert(zzOff <= len);
             ulong c = 0;
             for (int i = 0; i < len; ++i)
             {
-                c += (ulong)x[xOff + i] + zz[zzOff + i];
-                zz[zzOff + i] = (uint)c;
+                c += (ulong)x[xOff + i] + z[zOff + i];
+                z[zOff + i] = (uint)c;
                 c >>= 32;
             }
             return (uint)c;
@@ -205,23 +215,50 @@ namespace Org.BouncyCastle.Math.EC
 
         public static void Mul(int len, uint[] x, uint[] y, uint[] zz)
         {
-            zz[len] = (uint)MulWordExt(len, x[0], y, zz, 0);
+            zz[len] = (uint)MulWord(len, x[0], y, zz, 0);
 
             for (int i = 1; i < len; ++i)
             {
-                zz[i + len] = (uint)MulWordAddExt(len, x[i], y, zz, i);
+                zz[i + len] = (uint)MulWordAdd(len, x[i], y, zz, i);
+            }
+        }
+
+        public static uint Mul31BothAdd(int len, uint a, uint[] x, uint b, uint[] y, uint[] z, int zOff) // z[zOff..] += a * x + b * y over len words (x and y are read from index 0); returns the carry out
+        {
+            ulong c = 0, aVal = (ulong)a, bVal = (ulong)b; // widened so the products are 64-bit; NOTE(review): the "31" in the name suggests a and b must fit in 31 bits so c cannot overflow - not asserted here, confirm with callers
+            int i = 0;
+            do // do/while: word 0 is processed unconditionally, so len is expected to be >= 1
+            {
+                c += aVal * x[i] + bVal * y[i] + z[zOff + i];
+                z[zOff + i] = (uint)c; // low 32 bits are the result word
+                c >>= 32; // the rest carries into the next word
+            }
+            while (++i < len);
+            return (uint)c;
+        }
+
+        public static uint MulWord(int len, uint x, uint[] y, uint[] z, int zOff) // z[zOff..zOff+len-1] = low len words of x * y (y read from index 0, z overwritten); returns the top carry word
+        {
+            ulong c = 0, xVal = (ulong)x; // xVal widened so each product is computed in 64 bits
+            int i = 0;
+            do // do/while: word 0 is processed unconditionally, so len is expected to be >= 1
+            {
+                c += xVal * y[i];
+                z[zOff + i] = (uint)c; // low 32 bits are the result word
+                c >>= 32; // high 32 bits carry into the next word
             }
+            while (++i < len);
+            return (uint)c;
         }
 
-        public static uint MulWordAddExt(int len, uint x, uint[] y, uint[] zz, int zzOff)
+        public static uint MulWordAdd(int len, uint x, uint[] y, uint[] z, int zOff)
         {
-            Debug.Assert(zzOff <= len);
             ulong c = 0, xVal = (ulong)x;
             int i = 0;
             do
             {
-                c += xVal * y[i] + zz[zzOff + i];
-                zz[zzOff + i] = (uint)c;
+                c += xVal * y[i] + z[zOff + i];
+                z[zOff + i] = (uint)c;
                 c >>= 32;
             }
             while (++i < len);
@@ -244,21 +281,6 @@ namespace Org.BouncyCastle.Math.EC
             return c == 0 ? 0 : Inc(len, z, zOff + 3);
         }
 
-        public static uint MulWordExt(int len, uint x, uint[] y, uint[] zz, int zzOff)
-        {
-            Debug.Assert(zzOff <= len);
-            ulong c = 0, xVal = (ulong)x;
-            int i = 0;
-            do
-            {
-                c += xVal * y[i];
-                zz[zzOff + i] = (uint)c;
-                c >>= 32;
-            }
-            while (++i < len);
-            return (uint)c;
-        }
-
         public static uint ShiftDownBit(int len, uint[] z, uint c)
         {
             int i = len;
@@ -296,13 +318,13 @@ namespace Org.BouncyCastle.Math.EC
             return c << -bits;
         }
 
-        public static uint ShiftDownBitsExt(int len, uint[] xx, int xxOff, int bits, uint c, uint[] z)
+        public static uint ShiftDownBits(int len, uint[] x, int xOff, int bits, uint c, uint[] z)
         {
             Debug.Assert(bits > 0 && bits < 32);
             int i = len;
             while (--i >= 0)
             {
-                uint next = xx[xxOff + i];
+                uint next = x[xOff + i];
                 z[i] = (next >> bits) | (c << -bits);
                 c = next;
             }
@@ -430,6 +452,18 @@ namespace Org.BouncyCastle.Math.EC
             return (int)c;
         }
 
+        public static int SubBothFrom(int len, uint[] x, int xOff, uint[] y, int yOff, uint[] z, int zOff) // z[zOff..] -= x[xOff..] + y[yOff..] over len words; returns the final borrow as a non-positive int (0 when there is none)
+        {
+            long c = 0; // signed accumulator: a negative value represents an outstanding borrow
+            for (int i = 0; i < len; ++i)
+            {
+                c += (long)z[zOff + i] - x[xOff + i] - y[yOff + i];
+                z[zOff + i] = (uint)c; // low 32 bits are the result word
+                c >>= 32; // arithmetic shift keeps the sign, so the borrow propagates as a negative carry
+            }
+            return (int)c;
+        }
+
         // TODO Re-write to allow full range for x?
         public static int SubDWord(int len, ulong x, uint[] z)
         {
@@ -444,14 +478,13 @@ namespace Org.BouncyCastle.Math.EC
             return c == 0 ? 0 : Dec(len, z, 2);
         }
 
-        public static int SubFromExt(int len, uint[] x, int xOff, uint[] zz, int zzOff)
+        public static int SubFrom(int len, uint[] x, int xOff, uint[] z, int zOff)
         {
-            Debug.Assert(zzOff <= len);
             long c = 0;
             for (int i = 0; i < len; ++i)
             {
-                c += (long)zz[zzOff + i] - x[xOff + i];
-                zz[zzOff + i] = (uint)c;
+                c += (long)z[zOff + i] - x[xOff + i];
+                z[zOff + i] = (uint)c;
                 c >>= 32;
             }
             return (int)c;
diff --git a/crypto/src/math/ec/custom/sec/Nat256.cs b/crypto/src/math/ec/custom/sec/Nat256.cs
index 40e287b9b..3bd329251 100644
--- a/crypto/src/math/ec/custom/sec/Nat256.cs
+++ b/crypto/src/math/ec/custom/sec/Nat256.cs
@@ -39,6 +39,36 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             return (uint)c;
         }
 
+        public static uint Add(uint[] x, int xOff, uint[] y, int yOff, uint[] z, int zOff) // z[zOff..zOff+7] = x[xOff..xOff+7] + y[yOff..yOff+7] (8 words, fully unrolled); returns the carry out of word 7
+        {
+            ulong c = 0; // 64-bit accumulator: low word is the result, the rest is the carry
+            c += (ulong)x[xOff + 0] + y[yOff + 0];
+            z[zOff + 0] = (uint)c;
+            c >>= 32; // carry into the next word; the same three-step pattern repeats for words 1..7
+            c += (ulong)x[xOff + 1] + y[yOff + 1];
+            z[zOff + 1] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 2] + y[yOff + 2];
+            z[zOff + 2] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 3] + y[yOff + 3];
+            z[zOff + 3] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 4] + y[yOff + 4];
+            z[zOff + 4] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 5] + y[yOff + 5];
+            z[zOff + 5] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 6] + y[yOff + 6];
+            z[zOff + 6] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 7] + y[yOff + 7];
+            z[zOff + 7] = (uint)c;
+            c >>= 32;
+            return (uint)c;
+        }
+
         public static uint AddBothTo(uint[] x, uint[] y, uint[] z)
         {
             ulong c = 0;
@@ -69,6 +99,36 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             return (uint)c;
         }
 
+        public static uint AddBothTo(uint[] x, int xOff, uint[] y, int yOff, uint[] z, int zOff) // z[zOff..zOff+7] += x[xOff..xOff+7] + y[yOff..yOff+7] (8 words, fully unrolled); returns the carry out of word 7
+        {
+            ulong c = 0; // 64-bit accumulator: low word is the result, the rest is the carry
+            c += (ulong)x[xOff + 0] + y[yOff + 0] + z[zOff + 0];
+            z[zOff + 0] = (uint)c;
+            c >>= 32; // carry into the next word (may exceed 1 because three operands are summed)
+            c += (ulong)x[xOff + 1] + y[yOff + 1] + z[zOff + 1];
+            z[zOff + 1] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 2] + y[yOff + 2] + z[zOff + 2];
+            z[zOff + 2] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 3] + y[yOff + 3] + z[zOff + 3];
+            z[zOff + 3] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 4] + y[yOff + 4] + z[zOff + 4];
+            z[zOff + 4] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 5] + y[yOff + 5] + z[zOff + 5];
+            z[zOff + 5] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 6] + y[yOff + 6] + z[zOff + 6];
+            z[zOff + 6] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 7] + y[yOff + 7] + z[zOff + 7];
+            z[zOff + 7] = (uint)c;
+            c >>= 32;
+            return (uint)c;
+        }
+
         // TODO Re-write to allow full range for x?
         public static uint AddDWord(ulong x, uint[] z, int zOff)
         {
@@ -95,33 +155,70 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             return (uint)c;
         }
 
-        public static uint AddToExt(uint[] x, int xOff, uint[] zz, int zzOff)
+        public static uint AddTo(uint[] x, int xOff, uint[] z, int zOff, uint cIn) // z[zOff..zOff+7] += x[xOff..xOff+7] + cIn (8 words, fully unrolled); returns the carry out of word 7
+        {
+            ulong c = cIn; // the accumulator starts from the caller-supplied carry-in instead of zero
+            c += (ulong)x[xOff + 0] + z[zOff + 0];
+            z[zOff + 0] = (uint)c;
+            c >>= 32; // carry into the next word; the same three-step pattern repeats for words 1..7
+            c += (ulong)x[xOff + 1] + z[zOff + 1];
+            z[zOff + 1] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 2] + z[zOff + 2];
+            z[zOff + 2] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 3] + z[zOff + 3];
+            z[zOff + 3] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 4] + z[zOff + 4];
+            z[zOff + 4] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 5] + z[zOff + 5];
+            z[zOff + 5] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 6] + z[zOff + 6];
+            z[zOff + 6] = (uint)c;
+            c >>= 32;
+            c += (ulong)x[xOff + 7] + z[zOff + 7];
+            z[zOff + 7] = (uint)c;
+            c >>= 32;
+            return (uint)c;
+        }
+
+        public static uint AddToEachOther(uint[] u, int uOff, uint[] v, int vOff)
         {
-            Debug.Assert(zzOff <= 8);
             ulong c = 0;
-            c += (ulong)x[xOff + 0] + zz[zzOff + 0];
-            zz[zzOff + 0] = (uint)c;
+            c += (ulong)u[uOff + 0] + v[vOff + 0];
+            u[uOff + 0] = (uint)c;
+            v[vOff + 0] = (uint)c;
             c >>= 32;
-            c += (ulong)x[xOff + 1] + zz[zzOff + 1];
-            zz[zzOff + 1] = (uint)c;
+            c += (ulong)u[uOff + 1] + v[vOff + 1];
+            u[uOff + 1] = (uint)c;
+            v[vOff + 1] = (uint)c;
             c >>= 32;
-            c += (ulong)x[xOff + 2] + zz[zzOff + 2];
-            zz[zzOff + 2] = (uint)c;
+            c += (ulong)u[uOff + 2] + v[vOff + 2];
+            u[uOff + 2] = (uint)c;
+            v[vOff + 2] = (uint)c;
             c >>= 32;
-            c += (ulong)x[xOff + 3] + zz[zzOff + 3];
-            zz[zzOff + 3] = (uint)c;
+            c += (ulong)u[uOff + 3] + v[vOff + 3];
+            u[uOff + 3] = (uint)c;
+            v[vOff + 3] = (uint)c;
             c >>= 32;
-            c += (ulong)x[xOff + 4] + zz[zzOff + 4];
-            zz[zzOff + 4] = (uint)c;
+            c += (ulong)u[uOff + 4] + v[vOff + 4];
+            u[uOff + 4] = (uint)c;
+            v[vOff + 4] = (uint)c;
             c >>= 32;
-            c += (ulong)x[xOff + 5] + zz[zzOff + 5];
-            zz[zzOff + 5] = (uint)c;
+            c += (ulong)u[uOff + 5] + v[vOff + 5];
+            u[uOff + 5] = (uint)c;
+            v[vOff + 5] = (uint)c;
             c >>= 32;
-            c += (ulong)x[xOff + 6] + zz[zzOff + 6];
-            zz[zzOff + 6] = (uint)c;
+            c += (ulong)u[uOff + 6] + v[vOff + 6];
+            u[uOff + 6] = (uint)c;
+            v[vOff + 6] = (uint)c;
             c >>= 32;
-            c += (ulong)x[xOff + 7] + zz[zzOff + 7];
-            zz[zzOff + 7] = (uint)c;
+            c += (ulong)u[uOff + 7] + v[vOff + 7];
+            u[uOff + 7] = (uint)c;
+            v[vOff + 7] = (uint)c;
             c >>= 32;
             return (uint)c;
         }
@@ -158,6 +255,20 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             return -1;
         }
 
+        public static bool Diff(uint[] x, int xOff, uint[] y, int yOff, uint[] z, int zOff) // writes the absolute difference |x - y| of two 8-word values to z[zOff..]; returns true when x >= y, false when the operands were swapped
+        {
+            bool pos = Gte(x, xOff, y, yOff); // decide the operand order first so the subtraction below never needs a borrow out
+            if (pos)
+            {
+                Sub(x, xOff, y, yOff, z, zOff); // x >= y: z = x - y
+            }
+            else
+            {
+                Sub(y, yOff, x, xOff, z, zOff); // x < y: z = y - x
+            }
+            return pos;
+        }
+
         public static uint[] FromBigInteger(BigInteger x)
         {
             if (x.SignValue < 0 || x.BitLength > 256)
@@ -201,6 +312,19 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             return true;
         }
 
+        public static bool Gte(uint[] x, int xOff, uint[] y, int yOff) // returns true when the 8-word value at x[xOff..] is >= the 8-word value at y[yOff..]
+        {
+            for (int i = 7; i >= 0; --i) // scan from word 7 down to word 0; the first differing word decides
+            {
+                uint x_i = x[xOff + i], y_i = y[yOff + i];
+                if (x_i < y_i)
+                    return false;
+                if (x_i > y_i)
+                    return true;
+            }
+            return true; // all eight words equal, so x == y and "greater than or equal" holds
+        }
+
         public static bool GteExt(uint[] xx, uint[] yy)
         {
             for (int i = 15; i >= 0; --i)
@@ -351,6 +475,125 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             }
         }
 
+        public static void Mul(uint[] x, int xOff, uint[] y, int yOff, uint[] zz, int zzOff) // row-by-row 8x8-word product: zz[zzOff..zzOff+15] = x[xOff..xOff+7] * y[yOff..yOff+7]; zz is overwritten
+        {
+            ulong y_0 = y[yOff + 0]; // cache all eight words of y as 64-bit values so each x_i * y_j below is a 64-bit product
+            ulong y_1 = y[yOff + 1];
+            ulong y_2 = y[yOff + 2];
+            ulong y_3 = y[yOff + 3];
+            ulong y_4 = y[yOff + 4];
+            ulong y_5 = y[yOff + 5];
+            ulong y_6 = y[yOff + 6];
+            ulong y_7 = y[yOff + 7];
+
+            { // row 0: plain x_0 * y, written (not accumulated) into words 0..8 so zz needs no prior clearing
+                ulong c = 0, x_0 = x[xOff + 0];
+                c += x_0 * y_0;
+                zz[zzOff + 0] = (uint)c;
+                c >>= 32;
+                c += x_0 * y_1;
+                zz[zzOff + 1] = (uint)c;
+                c >>= 32;
+                c += x_0 * y_2;
+                zz[zzOff + 2] = (uint)c;
+                c >>= 32;
+                c += x_0 * y_3;
+                zz[zzOff + 3] = (uint)c;
+                c >>= 32;
+                c += x_0 * y_4;
+                zz[zzOff + 4] = (uint)c;
+                c >>= 32;
+                c += x_0 * y_5;
+                zz[zzOff + 5] = (uint)c;
+                c >>= 32;
+                c += x_0 * y_6;
+                zz[zzOff + 6] = (uint)c;
+                c >>= 32;
+                c += x_0 * y_7;
+                zz[zzOff + 7] = (uint)c;
+                c >>= 32;
+                zz[zzOff + 8] = (uint)c; // final carry of the row becomes word 8
+            }
+
+            for (int i = 1; i < 8; ++i) // rows 1..7: add x_i * y into the running result, one word further up each time
+            {
+                ++zzOff; // slide the window: the local copy of zzOff now addresses word i of the result
+                ulong c = 0, x_i = x[xOff + i];
+                c += x_i * y_0 + zz[zzOff + 0];
+                zz[zzOff + 0] = (uint)c;
+                c >>= 32;
+                c += x_i * y_1 + zz[zzOff + 1];
+                zz[zzOff + 1] = (uint)c;
+                c >>= 32;
+                c += x_i * y_2 + zz[zzOff + 2];
+                zz[zzOff + 2] = (uint)c;
+                c >>= 32;
+                c += x_i * y_3 + zz[zzOff + 3];
+                zz[zzOff + 3] = (uint)c;
+                c >>= 32;
+                c += x_i * y_4 + zz[zzOff + 4];
+                zz[zzOff + 4] = (uint)c;
+                c >>= 32;
+                c += x_i * y_5 + zz[zzOff + 5];
+                zz[zzOff + 5] = (uint)c;
+                c >>= 32;
+                c += x_i * y_6 + zz[zzOff + 6];
+                zz[zzOff + 6] = (uint)c;
+                c >>= 32;
+                c += x_i * y_7 + zz[zzOff + 7];
+                zz[zzOff + 7] = (uint)c;
+                c >>= 32;
+                zz[zzOff + 8] = (uint)c; // the word above the window has not been written yet, so the carry is stored rather than added
+            }
+        }
+
+        public static uint MulAdd(uint[] x, int xOff, uint[] y, int yOff, uint[] zz, int zzOff) // zz[zzOff..zzOff+15] += x[xOff..xOff+7] * y[yOff..yOff+7]; returns the carry out of word 15
+        {
+            ulong y_0 = y[yOff + 0]; // cache all eight words of y as 64-bit values so each x_i * y_j below is a 64-bit product
+            ulong y_1 = y[yOff + 1];
+            ulong y_2 = y[yOff + 2];
+            ulong y_3 = y[yOff + 3];
+            ulong y_4 = y[yOff + 4];
+            ulong y_5 = y[yOff + 5];
+            ulong y_6 = y[yOff + 6];
+            ulong y_7 = y[yOff + 7];
+
+            ulong zc = 0; // carry out of the top word of the previous row, fed into the next row's top word
+            for (int i = 0; i < 8; ++i)
+            {
+                ulong c = 0, x_i = x[xOff + i];
+                c += x_i * y_0 + zz[zzOff + 0];
+                zz[zzOff + 0] = (uint)c;
+                c >>= 32;
+                c += x_i * y_1 + zz[zzOff + 1];
+                zz[zzOff + 1] = (uint)c;
+                c >>= 32;
+                c += x_i * y_2 + zz[zzOff + 2];
+                zz[zzOff + 2] = (uint)c;
+                c >>= 32;
+                c += x_i * y_3 + zz[zzOff + 3];
+                zz[zzOff + 3] = (uint)c;
+                c >>= 32;
+                c += x_i * y_4 + zz[zzOff + 4];
+                zz[zzOff + 4] = (uint)c;
+                c >>= 32;
+                c += x_i * y_5 + zz[zzOff + 5];
+                zz[zzOff + 5] = (uint)c;
+                c >>= 32;
+                c += x_i * y_6 + zz[zzOff + 6];
+                zz[zzOff + 6] = (uint)c;
+                c >>= 32;
+                c += x_i * y_7 + zz[zzOff + 7];
+                zz[zzOff + 7] = (uint)c;
+                c >>= 32;
+                c += zc + zz[zzOff + 8]; // unlike Mul, the word above the window already holds data, so add into it together with the pending carry
+                zz[zzOff + 8] = (uint)c;
+                zc = c >> 32; // remember the overflow of that word for the next row
+                ++zzOff; // slide the window up one word (local copy of the parameter)
+            }
+            return (uint)zc;
+        }
+
         public static ulong Mul33AddExt(uint w, uint[] xx, int xxOff, uint[] yy, int yyOff, uint[] zz, int zzOff)
         {
             Debug.Assert(w >> 31 == 0);
@@ -539,6 +782,17 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             return c >> 31;
         }
 
+        public static uint ShiftUpBit(uint[] x, int xOff, int xLen, uint c) // shifts x[xOff..xOff+xLen-1] left by one bit in place; the top bit of c enters at bit 0 of the first word; returns the bit shifted out of the last word (0 or 1)
+        {
+            for (int i = 0; i < xLen; ++i)
+            {
+                uint next = x[xOff + i]; // read before overwriting: its top bit feeds the following word
+                x[xOff + i] = (next << 1) | (c >> 31);
+                c = next;
+            }
+            return c >> 31;
+        }
+
         public static uint ShiftUpBit(uint[] x, uint c, uint[] z)
         {
             for (int i = 0; i < 8; ++i)
@@ -686,6 +940,142 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ShiftUpBit(zz, 16, (uint)x_0 << 31);
         }
 
+        public static void Square(uint[] x, int xOff, uint[] zz, int zzOff) // zz[zzOff..zzOff+15] = (x[xOff..xOff+7])^2; zz is overwritten. Builds half of the result (halved diagonal squares + each cross product once), then doubles it with one final 1-bit shift
+        {
+            ulong x_0 = x[xOff + 0];
+            ulong zz_1; // word 1 of the running result, kept in a local until the first cross product is added
+
+            { // pass 1: store the diagonal terms x_i^2, shifted right by one bit, from the top word pair downwards
+                uint c = 0; // low 32 bits of the previously processed (next higher) square; only its bit 0 is used below
+                int i = 7, j = 16;
+                do
+                {
+                    ulong xVal = x[xOff + i--];
+                    ulong p = xVal * xVal;
+                    zz[zzOff + --j] = (c << 31) | (uint)(p >> 33); // upper word of p/2, with bit 0 of the higher square shifted in at the top
+                    zz[zzOff + --j] = (uint)(p >> 1); // lower word of p/2
+                    c = (uint)p;
+                }
+                while (i > 0); // stops before i == 0; x_0 is handled separately just below
+
+                {
+                    ulong p = x_0 * x_0;
+                    zz_1 = (ulong)(c << 31) | (p >> 33);
+                    zz[zzOff + 0] = (uint)(p >> 1); // bit 0 of x_0^2 is dropped here and restored by the final ShiftUpBit
+                }
+            }
+
+            ulong x_1 = x[xOff + 1]; // pass 2: for each x_i (i = 1..7) add the cross products x_i * x_j, j < i, exactly once
+            ulong zz_2 = zz[zzOff + 2];
+
+            {
+                zz_1 += x_1 * x_0;
+                zz[zzOff + 1] = (uint)zz_1;
+                zz_2 += zz_1 >> 32;
+            }
+
+            ulong x_2 = x[xOff + 2];
+            ulong zz_3 = zz[zzOff + 3];
+            ulong zz_4 = zz[zzOff + 4];
+            {
+                zz_2 += x_2 * x_0;
+                zz[zzOff + 2] = (uint)zz_2;
+                zz_3 += (zz_2 >> 32) + x_2 * x_1;
+                zz_4 += zz_3 >> 32;
+                zz_3 &= M; // M is declared outside this hunk - presumably the 32-bit mask 0xFFFFFFFF, clearing the carry just moved up; TODO confirm
+            }
+
+            ulong x_3 = x[xOff + 3];
+            ulong zz_5 = zz[zzOff + 5];
+            ulong zz_6 = zz[zzOff + 6];
+            {
+                zz_3 += x_3 * x_0;
+                zz[zzOff + 3] = (uint)zz_3;
+                zz_4 += (zz_3 >> 32) + x_3 * x_1;
+                zz_5 += (zz_4 >> 32) + x_3 * x_2;
+                zz_4 &= M;
+                zz_6 += zz_5 >> 32;
+                zz_5 &= M;
+            }
+
+            ulong x_4 = x[xOff + 4];
+            ulong zz_7 = zz[zzOff + 7];
+            ulong zz_8 = zz[zzOff + 8];
+            {
+                zz_4 += x_4 * x_0;
+                zz[zzOff + 4] = (uint)zz_4;
+                zz_5 += (zz_4 >> 32) + x_4 * x_1;
+                zz_6 += (zz_5 >> 32) + x_4 * x_2;
+                zz_5 &= M;
+                zz_7 += (zz_6 >> 32) + x_4 * x_3;
+                zz_6 &= M;
+                zz_8 += zz_7 >> 32;
+                zz_7 &= M;
+            }
+
+            ulong x_5 = x[xOff + 5];
+            ulong zz_9 = zz[zzOff + 9];
+            ulong zz_10 = zz[zzOff + 10];
+            {
+                zz_5 += x_5 * x_0;
+                zz[zzOff + 5] = (uint)zz_5;
+                zz_6 += (zz_5 >> 32) + x_5 * x_1;
+                zz_7 += (zz_6 >> 32) + x_5 * x_2;
+                zz_6 &= M;
+                zz_8 += (zz_7 >> 32) + x_5 * x_3;
+                zz_7 &= M;
+                zz_9 += (zz_8 >> 32) + x_5 * x_4;
+                zz_8 &= M;
+                zz_10 += zz_9 >> 32;
+                zz_9 &= M;
+            }
+
+            ulong x_6 = x[xOff + 6];
+            ulong zz_11 = zz[zzOff + 11];
+            ulong zz_12 = zz[zzOff + 12];
+            {
+                zz_6 += x_6 * x_0;
+                zz[zzOff + 6] = (uint)zz_6;
+                zz_7 += (zz_6 >> 32) + x_6 * x_1;
+                zz_8 += (zz_7 >> 32) + x_6 * x_2;
+                zz_7 &= M;
+                zz_9 += (zz_8 >> 32) + x_6 * x_3;
+                zz_8 &= M;
+                zz_10 += (zz_9 >> 32) + x_6 * x_4;
+                zz_9 &= M;
+                zz_11 += (zz_10 >> 32) + x_6 * x_5;
+                zz_10 &= M;
+                zz_12 += zz_11 >> 32;
+                zz_11 &= M;
+            }
+
+            ulong x_7 = x[xOff + 7];
+            ulong zz_13 = zz[zzOff + 13];
+            ulong zz_14 = zz[zzOff + 14];
+            { // last row: no masking needed, each local is truncated to 32 bits when stored below
+                zz_7 += x_7 * x_0;
+                zz[zzOff + 7] = (uint)zz_7;
+                zz_8 += (zz_7 >> 32) + x_7 * x_1;
+                zz_9 += (zz_8 >> 32) + x_7 * x_2;
+                zz_10 += (zz_9 >> 32) + x_7 * x_3;
+                zz_11 += (zz_10 >> 32) + x_7 * x_4;
+                zz_12 += (zz_11 >> 32) + x_7 * x_5;
+                zz_13 += (zz_12 >> 32) + x_7 * x_6;
+                zz_14 += zz_13 >> 32;
+            }
+
+            zz[zzOff + 8] = (uint)zz_8; // flush the locals for words 8..14 back to the output
+            zz[zzOff + 9] = (uint)zz_9;
+            zz[zzOff + 10] = (uint)zz_10;
+            zz[zzOff + 11] = (uint)zz_11;
+            zz[zzOff + 12] = (uint)zz_12;
+            zz[zzOff + 13] = (uint)zz_13;
+            zz[zzOff + 14] = (uint)zz_14;
+            zz[zzOff + 15] += (uint)(zz_14 >> 32); // word 15 still holds the top of x_7^2 / 2 from pass 1; add the final carry into it
+
+            ShiftUpBit(zz, zzOff, 16, (uint)x_0 << 31); // double everything; bit 0 of x_0 (equal to bit 0 of x_0^2) re-enters as the lowest result bit
+        }
+
         public static uint SquareWordAddExt(uint[] x, int xPos, uint[] zz)
         {
             Debug.Assert(xPos > 0 && xPos < 8);
@@ -731,6 +1121,36 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             return (int)c;
         }
 
+        public static int Sub(uint[] x, int xOff, uint[] y, int yOff, uint[] z, int zOff) // z[zOff..zOff+7] = x[xOff..xOff+7] - y[yOff..yOff+7] (8 words, fully unrolled); returns the borrow: 0, or -1 when x < y
+        {
+            long c = 0; // signed accumulator: a negative value represents an outstanding borrow
+            c += (long)x[xOff + 0] - y[yOff + 0];
+            z[zOff + 0] = (uint)c;
+            c >>= 32; // arithmetic shift keeps the sign, so the borrow carries into the next word; same pattern for words 1..7
+            c += (long)x[xOff + 1] - y[yOff + 1];
+            z[zOff + 1] = (uint)c;
+            c >>= 32;
+            c += (long)x[xOff + 2] - y[yOff + 2];
+            z[zOff + 2] = (uint)c;
+            c >>= 32;
+            c += (long)x[xOff + 3] - y[yOff + 3];
+            z[zOff + 3] = (uint)c;
+            c >>= 32;
+            c += (long)x[xOff + 4] - y[yOff + 4];
+            z[zOff + 4] = (uint)c;
+            c >>= 32;
+            c += (long)x[xOff + 5] - y[yOff + 5];
+            z[zOff + 5] = (uint)c;
+            c >>= 32;
+            c += (long)x[xOff + 6] - y[yOff + 6];
+            z[zOff + 6] = (uint)c;
+            c >>= 32;
+            c += (long)x[xOff + 7] - y[yOff + 7];
+            z[zOff + 7] = (uint)c;
+            c >>= 32;
+            return (int)c;
+        }
+
         public static int SubBothFrom(uint[] x, uint[] y, uint[] z)
         {
             long c = 0;
diff --git a/crypto/src/math/ec/custom/sec/Nat512.cs b/crypto/src/math/ec/custom/sec/Nat512.cs
new file mode 100644
index 000000000..7f1475306
--- /dev/null
+++ b/crypto/src/math/ec/custom/sec/Nat512.cs
@@ -0,0 +1,46 @@
+using System;
+using System.Diagnostics;
+
+namespace Org.BouncyCastle.Math.EC.Custom.Sec
+{
+    internal abstract class Nat512 // 512-bit (16-word) multiply and square built from three 256-bit operations each (Karatsuba, per the commit description)
+    {
+        public static void Mul(uint[] x, uint[] y, uint[] zz) // zz[0..31] = x[0..15] * y[0..15], using low*low, high*high and one product of half-differences
+        {
+            Nat256.Mul(x, y, zz); // 3-argument overload is not part of this diff - presumably low halves: zz[0..15] = x[0..7] * y[0..7]; confirm
+            Nat256.Mul(x, 8, y, 8, zz, 16); // high halves: zz[16..31] = x[8..15] * y[8..15]
+
+            uint c24 = Nat256.AddToEachOther(zz, 8, zz, 16); // zz[8..15] and zz[16..23] both become their sum; c24 accumulates carries destined for word 24
+            uint c16 = c24 + Nat256.AddTo(zz, 0, zz, 8, 0); // zz[8..15] += zz[0..7]; c16 is the total carry destined for word 16
+            c24 += Nat256.AddTo(zz, 24, zz, 16, c16); // zz[16..23] += zz[24..31] + c16
+
+            uint[] dx = Nat256.Create(), dy = Nat256.Create(); // scratch for the two half-differences (Create is not shown here - presumably allocates 8 words)
+            bool neg = Nat256.Diff(x, 8, x, 0, dx, 0) != Nat256.Diff(y, 8, y, 0, dy, 0); // dx = |xHigh - xLow|, dy = |yHigh - yLow|; neg is true when exactly one difference was negative
+
+            uint[] tt = Nat256.CreateExt(); // scratch for the 16-word product dx * dy (CreateExt is not shown here - presumably allocates 16 words)
+            Nat256.Mul(dx, dy, tt);
+
+            c24 += neg ? Nat.AddTo(16, tt, 0, zz, 8) : (uint)Nat.SubFrom(16, tt, 0, zz, 8); // middle words -= (xHigh - xLow) * (yHigh - yLow); a SubFrom borrow of -1 wraps to 0xFFFFFFFF, so adding it decrements c24
+            Nat.AddWordExt(16, c24, zz, 24); // AddWordExt is not shown in this diff - presumably propagates c24 into zz starting at word 24; confirm
+        }
+
+        public static void Square(uint[] x, uint[] zz) // zz[0..31] = (x[0..15])^2, same scheme as Mul with both operands equal
+        {
+            Nat256.Square(x, zz); // 2-argument overload is not part of this diff - presumably zz[0..15] = (x[0..7])^2; confirm
+            Nat256.Square(x, 8, zz, 16); // zz[16..31] = (x[8..15])^2
+
+            uint c24 = Nat256.AddToEachOther(zz, 8, zz, 16); // zz[8..15] and zz[16..23] both become their sum; c24 accumulates carries destined for word 24
+            uint c16 = c24 + Nat256.AddTo(zz, 0, zz, 8, 0); // zz[8..15] += zz[0..7]; c16 is the total carry destined for word 16
+            c24 += Nat256.AddTo(zz, 24, zz, 16, c16); // zz[16..23] += zz[24..31] + c16
+
+            uint[] dx = Nat256.Create();
+            Nat256.Diff(x, 8, x, 0, dx, 0); // dx = |xHigh - xLow|; the sign result is ignored because the value is squared next
+
+            uint[] m = Nat256.CreateExt();
+            Nat256.Square(dx, m); // m = dx^2, never negative, so it is always subtracted below
+
+            c24 += (uint)Nat.SubFrom(16, m, 0, zz, 8); // middle words -= dx^2; a borrow of -1 wraps to 0xFFFFFFFF, so adding it decrements c24
+            Nat.AddWordExt(16, c24, zz, 24); // AddWordExt is not shown in this diff - presumably propagates c24 into zz starting at word 24; confirm
+        }
+    }
+}
diff --git a/crypto/src/math/ec/custom/sec/SecP521R1Field.cs b/crypto/src/math/ec/custom/sec/SecP521R1Field.cs
index f39a0daa6..38b177c0d 100644
--- a/crypto/src/math/ec/custom/sec/SecP521R1Field.cs
+++ b/crypto/src/math/ec/custom/sec/SecP521R1Field.cs
@@ -24,7 +24,7 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
         public static void AddOne(uint[] x, uint[] z)
         {
             Array.Copy(x, 0, z, 0, 16);
-            uint c = Nat.Inc(16, z, 0) + z[16];
+            uint c = Nat.Inc(16, z, 0) + x[16];
             if (c > P16 || (c == P16 && Nat.Eq(16, z, P)))
             {
                 c += Nat.Inc(16, z, 0);
@@ -45,15 +45,15 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
         public static void Half(uint[] x, uint[] z)
         {
-            uint c0 = x[0] & 1, x16 = x[16], c512 = x16 & 1;
-            Nat.ShiftDownBit(16, x, c512, z);
-            z[16] = (x16 >> 1) | (c0 << 8);
+            uint x16 = x[16];
+            uint c = Nat.ShiftDownBit(16, x, x16, z);
+            z[16] = (x16 >> 1) | (c >> 23);
         }
 
         public static void Multiply(uint[] x, uint[] y, uint[] z)
         {
-            uint[] tt = Nat.Create(34);
-            Nat.Mul(17, x, y, tt);
+            uint[] tt = Nat.Create(33);
+            ImplMultiply(x, y, tt);
             Reduce(tt, z);
         }
 
@@ -71,10 +71,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
         public static void Reduce(uint[] xx, uint[] z)
         {
-            Debug.Assert(xx[33] == 0);
             Debug.Assert(xx[32] >> 18 == 0);
             uint xx32 = xx[32];
-            uint c = Nat.ShiftDownBitsExt(16, xx, 16, 9, xx32, z) >> 23;
+            uint c = Nat.ShiftDownBits(16, xx, 16, 9, xx32, z) >> 23;
             c += xx32 >> 9;
             c += Nat.Add(16, z, xx, z);
             if (c > P16 || (c == P16 && Nat.Eq(16, z, P)))
@@ -99,21 +98,21 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
         public static void Square(uint[] x, uint[] z)
         {
-            uint[] tt = Nat.Create(34);
-            Nat.Square(17, x, tt);
+            uint[] tt = Nat.Create(33);
+            ImplSquare(x, tt);
             Reduce(tt, z);
         }
 
         public static void SquareN(uint[] x, int n, uint[] z)
         {
             Debug.Assert(n > 0);
-            uint[] tt = Nat.Create(34);
-            Nat.Square(17, x, tt);
+            uint[] tt = Nat.Create(33);
+            ImplSquare(x, tt);
             Reduce(tt, z);
 
             while (--n > 0)
             {
-                Nat.Square(17, z, tt);
+                ImplSquare(z, tt);
                 Reduce(tt, z);
             }
         }
@@ -139,5 +138,21 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             }
             z[16] = c;
         }
+
+        protected static void ImplMultiply(uint[] x, uint[] y, uint[] zz) // unreduced product of two 17-word operands into zz[0..32]: 16x16-word core via Nat512.Mul plus the terms involving the top words x[16] and y[16]
+        {
+            Nat512.Mul(x, y, zz); // zz[0..31] = x[0..15] * y[0..15]
+
+            uint x16 = x[16], y16 = y[16]; // top words; Mul31BothAdd's name suggests these are expected to be small (31 bits or fewer) - TODO confirm against the field representation
+            zz[32] = Nat.Mul31BothAdd(16, x16, y, y16, x, zz, 16) + (x16 * y16); // zz[16..31] += x16 * y[0..15] + y16 * x[0..15]; word 32 = that carry + the top*top term (32-bit arithmetic)
+        }
+
+        protected static void ImplSquare(uint[] x, uint[] zz) // unreduced square of a 17-word operand into zz[0..32]: 16-word core via Nat512.Square plus the terms involving the top word x[16]
+        {
+            Nat512.Square(x, zz); // zz[0..31] = (x[0..15])^2
+
+            uint x16 = x[16]; // top word; the 32-bit expressions below assume it is small enough not to overflow - TODO confirm against the field representation
+            zz[32] = Nat.MulWordAdd(16, x16 << 1, x, zz, 16) + (x16 * x16); // zz[16..31] += 2 * x16 * x[0..15] (cross term counted twice); word 32 = that carry + x16^2
+        }
     }
 }