summary refs log tree commit diff
path: root/crypto/src/math/ec
diff options
context:
space:
mode:
authorPeter Dettman <peter.dettman@bouncycastle.org>2014-03-14 13:14:11 +0700
committerPeter Dettman <peter.dettman@bouncycastle.org>2014-03-14 13:14:11 +0700
commit8e46418932e542e89d450360aade2ae257b8b5cb (patch)
tree6e820395ddfc46b9918f328b41e1d916458e81aa /crypto/src/math/ec
parentGlvMultiplier.cs missed in last commit (diff)
downloadBouncyCastle.NET-ed25519-8e46418932e542e89d450360aade2ae257b8b5cb.tar.xz
Optimize squaring by inlining the ShiftUpBit
Diffstat (limited to 'crypto/src/math/ec')
-rw-r--r--crypto/src/math/ec/custom/sec/Nat192.cs100
-rw-r--r--crypto/src/math/ec/custom/sec/Nat224.cs116
-rw-r--r--crypto/src/math/ec/custom/sec/Nat256.cs164
3 files changed, 262 insertions, 118 deletions
diff --git a/crypto/src/math/ec/custom/sec/Nat192.cs b/crypto/src/math/ec/custom/sec/Nat192.cs
index 87dbcec84..94d7ed17c 100644
--- a/crypto/src/math/ec/custom/sec/Nat192.cs
+++ b/crypto/src/math/ec/custom/sec/Nat192.cs
@@ -588,8 +588,8 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong x_0 = x[0];
             ulong zz_1;
 
+            uint c = 0, w;
             {
-                uint c = 0;
                 int i = 5, j = 12;
                 do
                 {
@@ -604,7 +604,8 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
                 {
                     ulong p = x_0 * x_0;
                     zz_1 = (ulong)(c << 31) | (p >> 33);
-                    zz[0] = (uint)(p >> 1);
+                    zz[0] = (uint)p;
+                    c = (uint)(p >> 32) & 1;
                 }
             }
 
@@ -613,7 +614,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
             {
                 zz_1 += x_1 * x_0;
-                zz[1] = (uint)zz_1;
+                w = (uint)zz_1;
+                zz[1] = (w << 1) | c;
+                c = w >> 31;
                 zz_2 += zz_1 >> 32;
             }
 
@@ -622,7 +625,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_4 = zz[4];
             {
                 zz_2 += x_2 * x_0;
-                zz[2] = (uint)zz_2;
+                w = (uint)zz_2;
+                zz[2] = (w << 1) | c;
+                c = w >> 31;
                 zz_3 += (zz_2 >> 32) + x_2 * x_1;
                 zz_4 += zz_3 >> 32;
                 zz_3 &= M;
@@ -633,7 +638,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_6 = zz[6];
             {
                 zz_3 += x_3 * x_0;
-                zz[3] = (uint)zz_3;
+                w = (uint)zz_3;
+                zz[3] = (w << 1) | c;
+                c = w >> 31;
                 zz_4 += (zz_3 >> 32) + x_3 * x_1;
                 zz_5 += (zz_4 >> 32) + x_3 * x_2;
                 zz_4 &= M;
@@ -646,7 +653,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_8 = zz[8];
             {
                 zz_4 += x_4 * x_0;
-                zz[4] = (uint)zz_4;
+                w = (uint)zz_4;
+                zz[4] = (w << 1) | c;
+                c = w >> 31;
                 zz_5 += (zz_4 >> 32) + x_4 * x_1;
                 zz_6 += (zz_5 >> 32) + x_4 * x_2;
                 zz_5 &= M;
@@ -661,7 +670,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_10 = zz[10];
             {
                 zz_5 += x_5 * x_0;
-                zz[5] = (uint)zz_5;
+                w = (uint)zz_5;
+                zz[5] = (w << 1) | c;
+                c = w >> 31;
                 zz_6 += (zz_5 >> 32) + x_5 * x_1;
                 zz_7 += (zz_6 >> 32) + x_5 * x_2;
                 zz_8 += (zz_7 >> 32) + x_5 * x_3;
@@ -669,14 +680,23 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
                 zz_10 += zz_9 >> 32;
             }
 
-            zz[6] = (uint)zz_6;
-            zz[7] = (uint)zz_7;
-            zz[8] = (uint)zz_8;
-            zz[9] = (uint)zz_9;
-            zz[10] = (uint)zz_10;
-            zz[11] += (uint)(zz_10 >> 32);
-
-            Nat.ShiftUpBit(12, zz, (uint)x_0 << 31);
+            w = (uint)zz_6;
+            zz[6] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_7;
+            zz[7] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_8;
+            zz[8] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_9;
+            zz[9] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_10;
+            zz[10] = (w << 1) | c;
+            c = w >> 31;
+            w = zz[11] + (uint)(zz_10 >> 32);
+            zz[11] = (w << 1) | c;
         }
 
         public static void Square(uint[] x, int xOff, uint[] zz, int zzOff)
@@ -684,8 +704,8 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong x_0 = x[xOff + 0];
             ulong zz_1;
 
+            uint c = 0, w;
             {
-                uint c = 0;
                 int i = 5, j = 12;
                 do
                 {
@@ -700,7 +720,8 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
                 {
                     ulong p = x_0 * x_0;
                     zz_1 = (ulong)(c << 31) | (p >> 33);
-                    zz[zzOff + 0] = (uint)(p >> 1);
+                    zz[zzOff + 0] = (uint)p;
+                    c = (uint)(p >> 32) & 1;
                 }
             }
 
@@ -709,7 +730,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
             {
                 zz_1 += x_1 * x_0;
-                zz[zzOff + 1] = (uint)zz_1;
+                w = (uint)zz_1;
+                zz[zzOff + 1] = (w << 1) | c;
+                c = w >> 31;
                 zz_2 += zz_1 >> 32;
             }
 
@@ -718,7 +741,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_4 = zz[zzOff + 4];
             {
                 zz_2 += x_2 * x_0;
-                zz[zzOff + 2] = (uint)zz_2;
+                w = (uint)zz_2;
+                zz[zzOff + 2] = (w << 1) | c;
+                c = w >> 31;
                 zz_3 += (zz_2 >> 32) + x_2 * x_1;
                 zz_4 += zz_3 >> 32;
                 zz_3 &= M;
@@ -729,7 +754,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_6 = zz[zzOff + 6];
             {
                 zz_3 += x_3 * x_0;
-                zz[zzOff + 3] = (uint)zz_3;
+                w = (uint)zz_3;
+                zz[zzOff + 3] = (w << 1) | c;
+                c = w >> 31;
                 zz_4 += (zz_3 >> 32) + x_3 * x_1;
                 zz_5 += (zz_4 >> 32) + x_3 * x_2;
                 zz_4 &= M;
@@ -742,7 +769,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_8 = zz[zzOff + 8];
             {
                 zz_4 += x_4 * x_0;
-                zz[zzOff + 4] = (uint)zz_4;
+                w = (uint)zz_4;
+                zz[zzOff + 4] = (w << 1) | c;
+                c = w >> 31;
                 zz_5 += (zz_4 >> 32) + x_4 * x_1;
                 zz_6 += (zz_5 >> 32) + x_4 * x_2;
                 zz_5 &= M;
@@ -757,7 +786,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_10 = zz[zzOff + 10];
             {
                 zz_5 += x_5 * x_0;
-                zz[zzOff + 5] = (uint)zz_5;
+                w = (uint)zz_5;
+                zz[zzOff + 5] = (w << 1) | c;
+                c = w >> 31;
                 zz_6 += (zz_5 >> 32) + x_5 * x_1;
                 zz_7 += (zz_6 >> 32) + x_5 * x_2;
                 zz_8 += (zz_7 >> 32) + x_5 * x_3;
@@ -765,14 +796,23 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
                 zz_10 += zz_9 >> 32;
             }
 
-            zz[zzOff + 6] = (uint)zz_6;
-            zz[zzOff + 7] = (uint)zz_7;
-            zz[zzOff + 8] = (uint)zz_8;
-            zz[zzOff + 9] = (uint)zz_9;
-            zz[zzOff + 10] = (uint)zz_10;
-            zz[zzOff + 11] += (uint)(zz_10 >> 32);
-
-            Nat.ShiftUpBit(12, zz, zzOff, (uint)x_0 << 31);
+            w = (uint)zz_6;
+            zz[zzOff + 6] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_7;
+            zz[zzOff + 7] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_8;
+            zz[zzOff + 8] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_9;
+            zz[zzOff + 9] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_10;
+            zz[zzOff + 10] = (w << 1) | c;
+            c = w >> 31;
+            w = zz[zzOff + 11] + (uint)(zz_10 >> 32);
+            zz[zzOff + 11] = (w << 1) | c;
         }
 
         public static int Sub(uint[] x, uint[] y, uint[] z)
diff --git a/crypto/src/math/ec/custom/sec/Nat224.cs b/crypto/src/math/ec/custom/sec/Nat224.cs
index 62edc6baf..d5b916a54 100644
--- a/crypto/src/math/ec/custom/sec/Nat224.cs
+++ b/crypto/src/math/ec/custom/sec/Nat224.cs
@@ -740,8 +740,8 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong x_0 = x[0];
             ulong zz_1;
 
+            uint c = 0, w;
             {
-                uint c = 0;
                 int i = 6, j = 14;
                 do
                 {
@@ -756,7 +756,8 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
                 {
                     ulong p = x_0 * x_0;
                     zz_1 = (ulong)(c << 31) | (p >> 33);
-                    zz[0] = (uint)(p >> 1);
+                    zz[0] = (uint)p;
+                    c = (uint)(p >> 32) & 1;
                 }
             }
 
@@ -765,7 +766,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
             {
                 zz_1 += x_1 * x_0;
-                zz[1] = (uint)zz_1;
+                w = (uint)zz_1;
+                zz[1] = (w << 1) | c;
+                c = w >> 31;
                 zz_2 += zz_1 >> 32;
             }
 
@@ -774,7 +777,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_4 = zz[4];
             {
                 zz_2 += x_2 * x_0;
-                zz[2] = (uint)zz_2;
+                w = (uint)zz_2;
+                zz[2] = (w << 1) | c;
+                c = w >> 31;
                 zz_3 += (zz_2 >> 32) + x_2 * x_1;
                 zz_4 += zz_3 >> 32;
                 zz_3 &= M;
@@ -785,7 +790,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_6 = zz[6];
             {
                 zz_3 += x_3 * x_0;
-                zz[3] = (uint)zz_3;
+                w = (uint)zz_3;
+                zz[3] = (w << 1) | c;
+                c = w >> 31;
                 zz_4 += (zz_3 >> 32) + x_3 * x_1;
                 zz_5 += (zz_4 >> 32) + x_3 * x_2;
                 zz_4 &= M;
@@ -798,7 +805,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_8 = zz[8];
             {
                 zz_4 += x_4 * x_0;
-                zz[4] = (uint)zz_4;
+                w = (uint)zz_4;
+                zz[4] = (w << 1) | c;
+                c = w >> 31;
                 zz_5 += (zz_4 >> 32) + x_4 * x_1;
                 zz_6 += (zz_5 >> 32) + x_4 * x_2;
                 zz_5 &= M;
@@ -813,7 +822,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_10 = zz[10];
             {
                 zz_5 += x_5 * x_0;
-                zz[5] = (uint)zz_5;
+                w = (uint)zz_5;
+                zz[5] = (w << 1) | c;
+                c = w >> 31;
                 zz_6 += (zz_5 >> 32) + x_5 * x_1;
                 zz_7 += (zz_6 >> 32) + x_5 * x_2;
                 zz_6 &= M;
@@ -830,7 +841,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_12 = zz[12];
             {
                 zz_6 += x_6 * x_0;
-                zz[6] = (uint)zz_6;
+                w = (uint)zz_6;
+                zz[6] = (w << 1) | c;
+                c = w >> 31;
                 zz_7 += (zz_6 >> 32) + x_6 * x_1;
                 zz_8 += (zz_7 >> 32) + x_6 * x_2;
                 zz_9 += (zz_8 >> 32) + x_6 * x_3;
@@ -839,15 +852,26 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
                 zz_12 += zz_11 >> 32;
             }
 
-            zz[7] = (uint)zz_7;
-            zz[8] = (uint)zz_8;
-            zz[9] = (uint)zz_9;
-            zz[10] = (uint)zz_10;
-            zz[11] = (uint)zz_11;
-            zz[12] = (uint)zz_12;
-            zz[13] += (uint)(zz_12 >> 32);
-
-            Nat.ShiftUpBit(14, zz, (uint)x_0 << 31);
+            w = (uint)zz_7;
+            zz[7] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_8;
+            zz[8] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_9;
+            zz[9] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_10;
+            zz[10] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_11;
+            zz[11] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_12;
+            zz[12] = (w << 1) | c;
+            c = w >> 31;
+            w = zz[13] + (uint)(zz_12 >> 32);
+            zz[13] = (w << 1) | c;
         }
 
         public static void Square(uint[] x, int xOff, uint[] zz, int zzOff)
@@ -855,8 +879,8 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong x_0 = x[xOff + 0];
             ulong zz_1;
 
+            uint c = 0, w;
             {
-                uint c = 0;
                 int i = 6, j = 14;
                 do
                 {
@@ -871,7 +895,8 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
                 {
                     ulong p = x_0 * x_0;
                     zz_1 = (ulong)(c << 31) | (p >> 33);
-                    zz[zzOff + 0] = (uint)(p >> 1);
+                    zz[zzOff + 0] = (uint)p;
+                    c = (uint)(p >> 32) & 1;
                 }
             }
 
@@ -880,7 +905,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
             {
                 zz_1 += x_1 * x_0;
-                zz[zzOff + 1] = (uint)zz_1;
+                w = (uint)zz_1;
+                zz[zzOff + 1] = (w << 1) | c;
+                c = w >> 31;
                 zz_2 += zz_1 >> 32;
             }
 
@@ -889,7 +916,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_4 = zz[zzOff + 4];
             {
                 zz_2 += x_2 * x_0;
-                zz[zzOff + 2] = (uint)zz_2;
+                w = (uint)zz_2;
+                zz[zzOff + 2] = (w << 1) | c;
+                c = w >> 31;
                 zz_3 += (zz_2 >> 32) + x_2 * x_1;
                 zz_4 += zz_3 >> 32;
                 zz_3 &= M;
@@ -900,7 +929,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_6 = zz[zzOff + 6];
             {
                 zz_3 += x_3 * x_0;
-                zz[zzOff + 3] = (uint)zz_3;
+                w = (uint)zz_3;
+                zz[zzOff + 3] = (w << 1) | c;
+                c = w >> 31;
                 zz_4 += (zz_3 >> 32) + x_3 * x_1;
                 zz_5 += (zz_4 >> 32) + x_3 * x_2;
                 zz_4 &= M;
@@ -913,7 +944,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_8 = zz[zzOff + 8];
             {
                 zz_4 += x_4 * x_0;
-                zz[zzOff + 4] = (uint)zz_4;
+                w = (uint)zz_4;
+                zz[zzOff + 4] = (w << 1) | c;
+                c = w >> 31;
                 zz_5 += (zz_4 >> 32) + x_4 * x_1;
                 zz_6 += (zz_5 >> 32) + x_4 * x_2;
                 zz_5 &= M;
@@ -928,7 +961,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_10 = zz[zzOff + 10];
             {
                 zz_5 += x_5 * x_0;
-                zz[zzOff + 5] = (uint)zz_5;
+                w = (uint)zz_5;
+                zz[zzOff + 5] = (w << 1) | c;
+                c = w >> 31;
                 zz_6 += (zz_5 >> 32) + x_5 * x_1;
                 zz_7 += (zz_6 >> 32) + x_5 * x_2;
                 zz_6 &= M;
@@ -945,7 +980,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_12 = zz[zzOff + 12];
             {
                 zz_6 += x_6 * x_0;
-                zz[zzOff + 6] = (uint)zz_6;
+                w = (uint)zz_6;
+                zz[zzOff + 6] = (w << 1) | c;
+                c = w >> 31;
                 zz_7 += (zz_6 >> 32) + x_6 * x_1;
                 zz_8 += (zz_7 >> 32) + x_6 * x_2;
                 zz_9 += (zz_8 >> 32) + x_6 * x_3;
@@ -954,15 +991,26 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
                 zz_12 += zz_11 >> 32;
             }
 
-            zz[zzOff + 7] = (uint)zz_7;
-            zz[zzOff + 8] = (uint)zz_8;
-            zz[zzOff + 9] = (uint)zz_9;
-            zz[zzOff + 10] = (uint)zz_10;
-            zz[zzOff + 11] = (uint)zz_11;
-            zz[zzOff + 12] = (uint)zz_12;
-            zz[zzOff + 13] += (uint)(zz_12 >> 32);
-
-            Nat.ShiftUpBit(14, zz, zzOff, (uint)x_0 << 31);
+            w = (uint)zz_7;
+            zz[zzOff + 7] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_8;
+            zz[zzOff + 8] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_9;
+            zz[zzOff + 9] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_10;
+            zz[zzOff + 10] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_11;
+            zz[zzOff + 11] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_12;
+            zz[zzOff + 12] = (w << 1) | c;
+            c = w >> 31;
+            w = zz[zzOff + 13] + (uint)(zz_12 >> 32);
+            zz[zzOff + 13] = (w << 1) | c;
         }
 
         public static int Sub(uint[] x, uint[] y, uint[] z)
diff --git a/crypto/src/math/ec/custom/sec/Nat256.cs b/crypto/src/math/ec/custom/sec/Nat256.cs
index 93501cd94..bd2d6da47 100644
--- a/crypto/src/math/ec/custom/sec/Nat256.cs
+++ b/crypto/src/math/ec/custom/sec/Nat256.cs
@@ -132,29 +132,29 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
         public static uint AddTo(uint[] x, uint[] z)
         {
             ulong c = 0;
-            c += (ulong)x[ 0] + z[ 0];
-            z[ 0] = (uint)c;
+            c += (ulong)x[0] + z[0];
+            z[0] = (uint)c;
             c >>= 32;
-            c += (ulong)x[ 1] + z[ 1];
-            z[ 1] = (uint)c;
+            c += (ulong)x[1] + z[1];
+            z[1] = (uint)c;
             c >>= 32;
-            c += (ulong)x[ 2] + z[ 2];
-            z[ 2] = (uint)c;
+            c += (ulong)x[2] + z[2];
+            z[2] = (uint)c;
             c >>= 32;
-            c += (ulong)x[ 3] + z[ 3];
-            z[ 3] = (uint)c;
+            c += (ulong)x[3] + z[3];
+            z[3] = (uint)c;
             c >>= 32;
-            c += (ulong)x[ 4] + z[ 4];
-            z[ 4] = (uint)c;
+            c += (ulong)x[4] + z[4];
+            z[4] = (uint)c;
             c >>= 32;
-            c += (ulong)x[ 5] + z[ 5];
-            z[ 5] = (uint)c;
+            c += (ulong)x[5] + z[5];
+            z[5] = (uint)c;
             c >>= 32;
-            c += (ulong)x[ 6] + z[ 6];
-            z[ 6] = (uint)c;
+            c += (ulong)x[6] + z[6];
+            z[6] = (uint)c;
             c >>= 32;
-            c += (ulong)x[ 7] + z[ 7];
-            z[ 7] = (uint)c;
+            c += (ulong)x[7] + z[7];
+            z[7] = (uint)c;
             c >>= 32;
             return (uint)c;
         }
@@ -798,8 +798,8 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong x_0 = x[0];
             ulong zz_1;
 
+            uint c = 0, w;
             {
-                uint c = 0;
                 int i = 7, j = 16;
                 do
                 {
@@ -814,7 +814,8 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
                 {
                     ulong p = x_0 * x_0;
                     zz_1 = (ulong)(c << 31) | (p >> 33);
-                    zz[0] = (uint)(p >> 1);
+                    zz[0] = (uint)p;
+                    c = (uint)(p >> 32) & 1;
                 }
             }
 
@@ -823,7 +824,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
             {
                 zz_1 += x_1 * x_0;
-                zz[1] = (uint)zz_1;
+                w = (uint)zz_1;
+                zz[1] = (w << 1) | c;
+                c = w >> 31;
                 zz_2 += zz_1 >> 32;
             }
 
@@ -832,7 +835,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_4 = zz[4];
             {
                 zz_2 += x_2 * x_0;
-                zz[2] = (uint)zz_2;
+                w = (uint)zz_2;
+                zz[2] = (w << 1) | c;
+                c = w >> 31;
                 zz_3 += (zz_2 >> 32) + x_2 * x_1;
                 zz_4 += zz_3 >> 32;
                 zz_3 &= M;
@@ -843,7 +848,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_6 = zz[6];
             {
                 zz_3 += x_3 * x_0;
-                zz[3] = (uint)zz_3;
+                w = (uint)zz_3;
+                zz[3] = (w << 1) | c;
+                c = w >> 31;
                 zz_4 += (zz_3 >> 32) + x_3 * x_1;
                 zz_5 += (zz_4 >> 32) + x_3 * x_2;
                 zz_4 &= M;
@@ -856,7 +863,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_8 = zz[8];
             {
                 zz_4 += x_4 * x_0;
-                zz[4] = (uint)zz_4;
+                w = (uint)zz_4;
+                zz[4] = (w << 1) | c;
+                c = w >> 31;
                 zz_5 += (zz_4 >> 32) + x_4 * x_1;
                 zz_6 += (zz_5 >> 32) + x_4 * x_2;
                 zz_5 &= M;
@@ -871,7 +880,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_10 = zz[10];
             {
                 zz_5 += x_5 * x_0;
-                zz[5] = (uint)zz_5;
+                w = (uint)zz_5;
+                zz[5] = (w << 1) | c;
+                c = w >> 31;
                 zz_6 += (zz_5 >> 32) + x_5 * x_1;
                 zz_7 += (zz_6 >> 32) + x_5 * x_2;
                 zz_6 &= M;
@@ -888,7 +899,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_12 = zz[12];
             {
                 zz_6 += x_6 * x_0;
-                zz[6] = (uint)zz_6;
+                w = (uint)zz_6;
+                zz[6] = (w << 1) | c;
+                c = w >> 31;
                 zz_7 += (zz_6 >> 32) + x_6 * x_1;
                 zz_8 += (zz_7 >> 32) + x_6 * x_2;
                 zz_7 &= M;
@@ -907,7 +920,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_14 = zz[14];
             {
                 zz_7 += x_7 * x_0;
-                zz[7] = (uint)zz_7;
+                w = (uint)zz_7;
+                zz[7] = (w << 1) | c;
+                c = w >> 31;
                 zz_8 += (zz_7 >> 32) + x_7 * x_1;
                 zz_9 += (zz_8 >> 32) + x_7 * x_2;
                 zz_10 += (zz_9 >> 32) + x_7 * x_3;
@@ -917,16 +932,29 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
                 zz_14 += zz_13 >> 32;
             }
 
-            zz[8] = (uint)zz_8;
-            zz[9] = (uint)zz_9;
-            zz[10] = (uint)zz_10;
-            zz[11] = (uint)zz_11;
-            zz[12] = (uint)zz_12;
-            zz[13] = (uint)zz_13;
-            zz[14] = (uint)zz_14;
-            zz[15] += (uint)(zz_14 >> 32);
-
-            Nat.ShiftUpBit(16, zz, (uint)x_0 << 31);
+            w = (uint)zz_8;
+            zz[8] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_9;
+            zz[9] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_10;
+            zz[10] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_11;
+            zz[11] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_12;
+            zz[12] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_13;
+            zz[13] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_14;
+            zz[14] = (w << 1) | c;
+            c = w >> 31;
+            w = zz[15] + (uint)(zz_14 >> 32);
+            zz[15] = (w << 1) | c;
         }
 
         public static void Square(uint[] x, int xOff, uint[] zz, int zzOff)
@@ -934,8 +962,8 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong x_0 = x[xOff + 0];
             ulong zz_1;
 
+            uint c = 0, w;
             {
-                uint c = 0;
                 int i = 7, j = 16;
                 do
                 {
@@ -950,7 +978,8 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
                 {
                     ulong p = x_0 * x_0;
                     zz_1 = (ulong)(c << 31) | (p >> 33);
-                    zz[zzOff + 0] = (uint)(p >> 1);
+                    zz[zzOff + 0] = (uint)p;
+                    c = (uint)(p >> 32) & 1;
                 }
             }
 
@@ -959,7 +988,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
             {
                 zz_1 += x_1 * x_0;
-                zz[zzOff + 1] = (uint)zz_1;
+                w = (uint)zz_1;
+                zz[zzOff + 1] = (w << 1) | c;
+                c = w >> 31;
                 zz_2 += zz_1 >> 32;
             }
 
@@ -968,7 +999,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_4 = zz[zzOff + 4];
             {
                 zz_2 += x_2 * x_0;
-                zz[zzOff + 2] = (uint)zz_2;
+                w = (uint)zz_2;
+                zz[zzOff + 2] = (w << 1) | c;
+                c = w >> 31;
                 zz_3 += (zz_2 >> 32) + x_2 * x_1;
                 zz_4 += zz_3 >> 32;
                 zz_3 &= M;
@@ -979,7 +1012,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_6 = zz[zzOff + 6];
             {
                 zz_3 += x_3 * x_0;
-                zz[zzOff + 3] = (uint)zz_3;
+                w = (uint)zz_3;
+                zz[zzOff + 3] = (w << 1) | c;
+                c = w >> 31;
                 zz_4 += (zz_3 >> 32) + x_3 * x_1;
                 zz_5 += (zz_4 >> 32) + x_3 * x_2;
                 zz_4 &= M;
@@ -992,7 +1027,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_8 = zz[zzOff + 8];
             {
                 zz_4 += x_4 * x_0;
-                zz[zzOff + 4] = (uint)zz_4;
+                w = (uint)zz_4;
+                zz[zzOff + 4] = (w << 1) | c;
+                c = w >> 31;
                 zz_5 += (zz_4 >> 32) + x_4 * x_1;
                 zz_6 += (zz_5 >> 32) + x_4 * x_2;
                 zz_5 &= M;
@@ -1007,7 +1044,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_10 = zz[zzOff + 10];
             {
                 zz_5 += x_5 * x_0;
-                zz[zzOff + 5] = (uint)zz_5;
+                w = (uint)zz_5;
+                zz[zzOff + 5] = (w << 1) | c;
+                c = w >> 31;
                 zz_6 += (zz_5 >> 32) + x_5 * x_1;
                 zz_7 += (zz_6 >> 32) + x_5 * x_2;
                 zz_6 &= M;
@@ -1024,7 +1063,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_12 = zz[zzOff + 12];
             {
                 zz_6 += x_6 * x_0;
-                zz[zzOff + 6] = (uint)zz_6;
+                w = (uint)zz_6;
+                zz[zzOff + 6] = (w << 1) | c;
+                c = w >> 31;
                 zz_7 += (zz_6 >> 32) + x_6 * x_1;
                 zz_8 += (zz_7 >> 32) + x_6 * x_2;
                 zz_7 &= M;
@@ -1043,7 +1084,9 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             ulong zz_14 = zz[zzOff + 14];
             {
                 zz_7 += x_7 * x_0;
-                zz[zzOff + 7] = (uint)zz_7;
+                w = (uint)zz_7;
+                zz[zzOff + 7] = (w << 1) | c;
+                c = w >> 31;
                 zz_8 += (zz_7 >> 32) + x_7 * x_1;
                 zz_9 += (zz_8 >> 32) + x_7 * x_2;
                 zz_10 += (zz_9 >> 32) + x_7 * x_3;
@@ -1053,16 +1096,29 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
                 zz_14 += zz_13 >> 32;
             }
 
-            zz[zzOff + 8] = (uint)zz_8;
-            zz[zzOff + 9] = (uint)zz_9;
-            zz[zzOff + 10] = (uint)zz_10;
-            zz[zzOff + 11] = (uint)zz_11;
-            zz[zzOff + 12] = (uint)zz_12;
-            zz[zzOff + 13] = (uint)zz_13;
-            zz[zzOff + 14] = (uint)zz_14;
-            zz[zzOff + 15] += (uint)(zz_14 >> 32);
-
-            Nat.ShiftUpBit(16, zz, zzOff, (uint)x_0 << 31);
+            w = (uint)zz_8;
+            zz[zzOff + 8] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_9;
+            zz[zzOff + 9] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_10;
+            zz[zzOff + 10] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_11;
+            zz[zzOff + 11] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_12;
+            zz[zzOff + 12] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_13;
+            zz[zzOff + 13] = (w << 1) | c;
+            c = w >> 31;
+            w = (uint)zz_14;
+            zz[zzOff + 14] = (w << 1) | c;
+            c = w >> 31;
+            w = zz[zzOff + 15] + (uint)(zz_14 >> 32);
+            zz[zzOff + 15] = (w << 1) | c;
         }
 
         public static int Sub(uint[] x, uint[] y, uint[] z)