summary refs log tree commit diff
diff options
context:
space:
mode:
author Peter Dettman <peter.dettman@bouncycastle.org> 2022-08-04 22:40:39 +0700
committer Peter Dettman <peter.dettman@bouncycastle.org> 2022-08-04 22:40:39 +0700
commit 58bcadf7d2c4702ba3ee8fffcc0f05ec720c86eb (patch)
tree b8ed60ac3f1b60fc21a98943fc8d33f7a9c13385
parent Add span variant for Collect (diff)
download BouncyCastle.NET-ed25519-58bcadf7d2c4702ba3ee8fffcc0f05ec720c86eb.tar.xz
Perf. opts. in custom binary curves
-rw-r--r-- crypto/src/math/ec/custom/sec/SecT113Field.cs 33
-rw-r--r-- crypto/src/math/ec/custom/sec/SecT131Field.cs 41
-rw-r--r-- crypto/src/math/ec/custom/sec/SecT163Field.cs 42
-rw-r--r-- crypto/src/math/ec/custom/sec/SecT193Field.cs 46
-rw-r--r-- crypto/src/math/ec/custom/sec/SecT233Field.cs 62
-rw-r--r-- crypto/src/math/ec/custom/sec/SecT239Field.cs 62
-rw-r--r-- crypto/src/math/ec/custom/sec/SecT283Field.cs 64
7 files changed, 252 insertions, 98 deletions
diff --git a/crypto/src/math/ec/custom/sec/SecT113Field.cs b/crypto/src/math/ec/custom/sec/SecT113Field.cs
index 1b3fcc542..65249562a 100644
--- a/crypto/src/math/ec/custom/sec/SecT113Field.cs
+++ b/crypto/src/math/ec/custom/sec/SecT113Field.cs
@@ -170,6 +170,25 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
         protected static void ImplMultiply(ulong[] x, ulong[] y, ulong[] zz)
         {
+#if NETCOREAPP3_0_OR_GREATER
+            if (Pclmulqdq.IsSupported)
+            {
+                var X01 = Vector128.Create(x[0], x[1]);
+                var Y01 = Vector128.Create(y[0], y[1]);
+
+                var Z01 =          Pclmulqdq.CarrylessMultiply(X01, Y01, 0x00);
+                var Z12 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y01, 0x01),
+                                   Pclmulqdq.CarrylessMultiply(X01, Y01, 0x10));
+                var Z23 =          Pclmulqdq.CarrylessMultiply(X01, Y01, 0x11);
+
+                zz[0] = Z01.GetElement(0);
+                zz[1] = Z01.GetElement(1) ^ Z12.GetElement(0);
+                zz[2] = Z23.GetElement(0) ^ Z12.GetElement(1);
+                zz[3] = Z23.GetElement(1);
+                return;
+            }
+#endif
+
             /*
              * "Three-way recursion" as described in "Batch binary Edwards", Daniel J. Bernstein.
              */
@@ -206,20 +225,6 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             Debug.Assert(x >> 57 == 0);
             Debug.Assert(y >> 57 == 0);
 
-#if NETCOREAPP3_0_OR_GREATER
-            if (Pclmulqdq.IsSupported)
-            {
-                var X = Vector128.CreateScalar(x);
-                var Y = Vector128.CreateScalar(y);
-                var Z = Pclmulqdq.CarrylessMultiply(X, Y, 0x00);
-                ulong z0 = Z.GetElement(0);
-                ulong z1 = Z.GetElement(1);
-                z[zOff    ] = z0 & M57;
-                z[zOff + 1] = (z0 >> 57) ^ (z1 << 7);
-                return;
-            }
-#endif
-
             //u[0] = 0;
             u[1] = y;
             u[2] = u[1] << 1;
diff --git a/crypto/src/math/ec/custom/sec/SecT131Field.cs b/crypto/src/math/ec/custom/sec/SecT131Field.cs
index 53a9b832f..6088b264c 100644
--- a/crypto/src/math/ec/custom/sec/SecT131Field.cs
+++ b/crypto/src/math/ec/custom/sec/SecT131Field.cs
@@ -198,6 +198,33 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
         protected static void ImplMultiply(ulong[] x, ulong[] y, ulong[] zz)
         {
+#if NETCOREAPP3_0_OR_GREATER
+            if (Pclmulqdq.IsSupported)
+            {
+                var X01 = Vector128.Create(x[0], x[1]);
+                var X2_ = Vector128.CreateScalar(x[2]);
+                var Y01 = Vector128.Create(y[0], y[1]);
+                var Y2_ = Vector128.CreateScalar(y[2]);
+
+                var Z01 =          Pclmulqdq.CarrylessMultiply(X01, Y01, 0x00);
+                var Z12 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y01, 0x01),
+                                   Pclmulqdq.CarrylessMultiply(X01, Y01, 0x10));
+                var Z23 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y2_, 0x00),
+                          Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y01, 0x11),
+                                   Pclmulqdq.CarrylessMultiply(X2_, Y01, 0x00)));
+                var Z34 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y2_, 0x01),
+                                   Pclmulqdq.CarrylessMultiply(X2_, Y01, 0x10));
+                var Z4_ =          Pclmulqdq.CarrylessMultiply(X2_, Y2_, 0x00);
+
+                zz[0] = Z01.GetElement(0);
+                zz[1] = Z01.GetElement(1) ^ Z12.GetElement(0);
+                zz[2] = Z23.GetElement(0) ^ Z12.GetElement(1);
+                zz[3] = Z23.GetElement(1) ^ Z34.GetElement(0);
+                zz[4] = Z4_.GetElement(0) ^ Z34.GetElement(1);
+                return;
+            }
+#endif
+
             /*
              * "Five-way recursion" as described in "Batch binary Edwards", Daniel J. Bernstein.
              */
@@ -305,20 +332,6 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             Debug.Assert(x >> 45 == 0);
             Debug.Assert(y >> 45 == 0);
 
-#if NETCOREAPP3_0_OR_GREATER
-            if (Pclmulqdq.IsSupported)
-            {
-                var X = Vector128.CreateScalar(x);
-                var Y = Vector128.CreateScalar(y);
-                var Z = Pclmulqdq.CarrylessMultiply(X, Y, 0x00);
-                ulong z0 = Z.GetElement(0);
-                ulong z1 = Z.GetElement(1);
-                z[zOff    ] = z0 & M44;
-                z[zOff + 1] = (z0 >> 44) ^ (z1 << 20);
-                return;
-            }
-#endif
-
             //u[0] = 0;
             u[1] = y;
             u[2] = u[1] << 1;
diff --git a/crypto/src/math/ec/custom/sec/SecT163Field.cs b/crypto/src/math/ec/custom/sec/SecT163Field.cs
index 22d41882f..0c616600a 100644
--- a/crypto/src/math/ec/custom/sec/SecT163Field.cs
+++ b/crypto/src/math/ec/custom/sec/SecT163Field.cs
@@ -209,6 +209,34 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
         protected static void ImplMultiply(ulong[] x, ulong[] y, ulong[] zz)
         {
+#if NETCOREAPP3_0_OR_GREATER
+            if (Pclmulqdq.IsSupported)
+            {
+                var X01 = Vector128.Create(x[0], x[1]);
+                var X2_ = Vector128.CreateScalar(x[2]);
+                var Y01 = Vector128.Create(y[0], y[1]);
+                var Y2_ = Vector128.CreateScalar(y[2]);
+
+                var Z01 =          Pclmulqdq.CarrylessMultiply(X01, Y01, 0x00);
+                var Z12 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y01, 0x01),
+                                   Pclmulqdq.CarrylessMultiply(X01, Y01, 0x10));
+                var Z23 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y2_, 0x00),
+                          Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y01, 0x11),
+                                   Pclmulqdq.CarrylessMultiply(X2_, Y01, 0x00)));
+                var Z34 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y2_, 0x01),
+                                   Pclmulqdq.CarrylessMultiply(X2_, Y01, 0x10));
+                var Z45 =          Pclmulqdq.CarrylessMultiply(X2_, Y2_, 0x00);
+
+                zz[0] = Z01.GetElement(0);
+                zz[1] = Z01.GetElement(1) ^ Z12.GetElement(0);
+                zz[2] = Z23.GetElement(0) ^ Z12.GetElement(1);
+                zz[3] = Z23.GetElement(1) ^ Z34.GetElement(0);
+                zz[4] = Z45.GetElement(0) ^ Z34.GetElement(1);
+                zz[5] = Z45.GetElement(1);
+                return;
+            }
+#endif
+
             /*
              * "Five-way recursion" as described in "Batch binary Edwards", Daniel J. Bernstein.
              */
@@ -316,20 +344,6 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             Debug.Assert(x >> 56 == 0);
             Debug.Assert(y >> 56 == 0);
 
-#if NETCOREAPP3_0_OR_GREATER
-            if (Pclmulqdq.IsSupported)
-            {
-                var X = Vector128.CreateScalar(x);
-                var Y = Vector128.CreateScalar(y);
-                var Z = Pclmulqdq.CarrylessMultiply(X, Y, 0x00);
-                ulong z0 = Z.GetElement(0);
-                ulong z1 = Z.GetElement(1);
-                z[zOff    ] = z0 & M55;
-                z[zOff + 1] = (z0 >> 55) ^ (z1 << 9);
-                return;
-            }
-#endif
-
             //u[0] = 0;
             u[1] = y;
             u[2] = u[1] << 1;
diff --git a/crypto/src/math/ec/custom/sec/SecT193Field.cs b/crypto/src/math/ec/custom/sec/SecT193Field.cs
index 795e4fb35..4aa3ad5c2 100644
--- a/crypto/src/math/ec/custom/sec/SecT193Field.cs
+++ b/crypto/src/math/ec/custom/sec/SecT193Field.cs
@@ -230,6 +230,38 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
         protected static void ImplMultiply(ulong[] x, ulong[] y, ulong[] zz)
         {
+#if NETCOREAPP3_0_OR_GREATER
+            if (Pclmulqdq.IsSupported)
+            {
+                var X01 = Vector128.Create(x[0], x[1]);
+                var X2_ = Vector128.CreateScalar(x[2]);
+                var Y01 = Vector128.Create(y[0], y[1]);
+                var Y2_ = Vector128.CreateScalar(y[2]);
+
+                var Z01 =          Pclmulqdq.CarrylessMultiply(X01, Y01, 0x00);
+                var Z12 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y01, 0x01),
+                                   Pclmulqdq.CarrylessMultiply(X01, Y01, 0x10));
+                var Z23 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y2_, 0x00),
+                          Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y01, 0x11),
+                                   Pclmulqdq.CarrylessMultiply(X2_, Y01, 0x00)));
+                var Z34 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y2_, 0x01),
+                                   Pclmulqdq.CarrylessMultiply(X2_, Y01, 0x10));
+                var Z45 =          Pclmulqdq.CarrylessMultiply(X2_, Y2_, 0x00);
+
+                ulong X3M = 0UL - x[3];
+                ulong Y3M = 0UL - y[3];
+
+                zz[0] = Z01.GetElement(0);
+                zz[1] = Z01.GetElement(1) ^ Z12.GetElement(0);
+                zz[2] = Z23.GetElement(0) ^ Z12.GetElement(1);
+                zz[3] = Z23.GetElement(1) ^ Z34.GetElement(0) ^ (X3M & y[0]) ^ (x[0] & Y3M);
+                zz[4] = Z45.GetElement(0) ^ Z34.GetElement(1) ^ (X3M & y[1]) ^ (x[1] & Y3M);
+                zz[5] = Z45.GetElement(1)                     ^ (X3M & y[2]) ^ (x[2] & Y3M);
+                zz[6] =                                          X3M & y[3];
+                return;
+            }
+#endif
+
             /*
              * "Two-level seven-way recursion" as described in "Batch binary Edwards", Daniel J. Bernstein.
              */
@@ -283,20 +315,6 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             Debug.Assert(x >> 49 == 0);
             Debug.Assert(y >> 49 == 0);
 
-#if NETCOREAPP3_0_OR_GREATER
-            if (Pclmulqdq.IsSupported)
-            {
-                var X = Vector128.CreateScalar(x);
-                var Y = Vector128.CreateScalar(y);
-                var Z = Pclmulqdq.CarrylessMultiply(X, Y, 0x00);
-                ulong z0 = Z.GetElement(0);
-                ulong z1 = Z.GetElement(1);
-                z[zOff    ] ^= z0 & M49;
-                z[zOff + 1] ^= (z0 >> 49) ^ (z1 << 15);
-                return;
-            }
-#endif
-
             //u[0] = 0;
             u[1] = y;
             u[2] = u[1] << 1;
diff --git a/crypto/src/math/ec/custom/sec/SecT233Field.cs b/crypto/src/math/ec/custom/sec/SecT233Field.cs
index 08819f5ac..e4e291154 100644
--- a/crypto/src/math/ec/custom/sec/SecT233Field.cs
+++ b/crypto/src/math/ec/custom/sec/SecT233Field.cs
@@ -241,6 +241,54 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
         protected static void ImplMultiply(ulong[] x, ulong[] y, ulong[] zz)
         {
+#if NETCOREAPP3_0_OR_GREATER
+            if (Pclmulqdq.IsSupported)
+            {
+                var X01 = Vector128.Create(x[0], x[1]);
+                var X23 = Vector128.Create(x[2], x[3]);
+                var Y01 = Vector128.Create(y[0], y[1]);
+                var Y23 = Vector128.Create(y[2], y[3]);
+                var X03 = Sse2.Xor(X01, X23);
+                var Y03 = Sse2.Xor(Y01, Y23);
+
+                var Z01 =          Pclmulqdq.CarrylessMultiply(X01, Y01, 0x00);
+                var Z12 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y01, 0x01),
+                                   Pclmulqdq.CarrylessMultiply(X01, Y01, 0x10));
+                var Z23 =          Pclmulqdq.CarrylessMultiply(X01, Y01, 0x11);
+
+                var Z45 =          Pclmulqdq.CarrylessMultiply(X23, Y23, 0x00);
+                var Z56 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X23, Y23, 0x01),
+                                   Pclmulqdq.CarrylessMultiply(X23, Y23, 0x10));
+                var Z67 =          Pclmulqdq.CarrylessMultiply(X23, Y23, 0x11);
+
+                var K01 =          Pclmulqdq.CarrylessMultiply(X03, Y03, 0x00);
+                var K12 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X03, Y03, 0x01),
+                                   Pclmulqdq.CarrylessMultiply(X03, Y03, 0x10));
+                var K23 =          Pclmulqdq.CarrylessMultiply(X03, Y03, 0x11);
+
+                K01 = Sse2.Xor(K01, Z01);
+                K12 = Sse2.Xor(K12, Z12);
+                K23 = Sse2.Xor(K23, Z23);
+
+                K01 = Sse2.Xor(K01, Z45);
+                K12 = Sse2.Xor(K12, Z56);
+                K23 = Sse2.Xor(K23, Z67);
+
+                Z23 = Sse2.Xor(Z23, K01);
+                Z45 = Sse2.Xor(Z45, K23);
+
+                zz[0] = Z01.GetElement(0);
+                zz[1] = Z01.GetElement(1) ^ Z12.GetElement(0);
+                zz[2] = Z23.GetElement(0) ^ Z12.GetElement(1);
+                zz[3] = Z23.GetElement(1) ^ K12.GetElement(0);
+                zz[4] = Z45.GetElement(0) ^ K12.GetElement(1);
+                zz[5] = Z45.GetElement(1) ^ Z56.GetElement(0);
+                zz[6] = Z67.GetElement(0) ^ Z56.GetElement(1);
+                zz[7] = Z67.GetElement(1);
+                return;
+            }
+#endif
+
             /*
              * "Two-level seven-way recursion" as described in "Batch binary Edwards", Daniel J. Bernstein.
              */
@@ -294,20 +342,6 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             Debug.Assert(x >> 59 == 0);
             Debug.Assert(y >> 59 == 0);
 
-#if NETCOREAPP3_0_OR_GREATER
-            if (Pclmulqdq.IsSupported)
-            {
-                var X = Vector128.CreateScalar(x);
-                var Y = Vector128.CreateScalar(y);
-                var Z = Pclmulqdq.CarrylessMultiply(X, Y, 0x00);
-                ulong z0 = Z.GetElement(0);
-                ulong z1 = Z.GetElement(1);
-                z[zOff    ] ^= z0 & M59;
-                z[zOff + 1] ^= (z0 >> 59) ^ (z1 << 5);
-                return;
-            }
-#endif
-
             //u[0] = 0;
             u[1] = y;
             u[2] = u[1] << 1;
diff --git a/crypto/src/math/ec/custom/sec/SecT239Field.cs b/crypto/src/math/ec/custom/sec/SecT239Field.cs
index d377667ae..a3851de16 100644
--- a/crypto/src/math/ec/custom/sec/SecT239Field.cs
+++ b/crypto/src/math/ec/custom/sec/SecT239Field.cs
@@ -250,6 +250,54 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
         protected static void ImplMultiply(ulong[] x, ulong[] y, ulong[] zz)
         {
+#if NETCOREAPP3_0_OR_GREATER
+            if (Pclmulqdq.IsSupported)
+            {
+                var X01 = Vector128.Create(x[0], x[1]);
+                var X23 = Vector128.Create(x[2], x[3]);
+                var Y01 = Vector128.Create(y[0], y[1]);
+                var Y23 = Vector128.Create(y[2], y[3]);
+                var X03 = Sse2.Xor(X01, X23);
+                var Y03 = Sse2.Xor(Y01, Y23);
+
+                var Z01 =          Pclmulqdq.CarrylessMultiply(X01, Y01, 0x00);
+                var Z12 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y01, 0x01),
+                                   Pclmulqdq.CarrylessMultiply(X01, Y01, 0x10));
+                var Z23 =          Pclmulqdq.CarrylessMultiply(X01, Y01, 0x11);
+
+                var Z45 =          Pclmulqdq.CarrylessMultiply(X23, Y23, 0x00);
+                var Z56 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X23, Y23, 0x01),
+                                   Pclmulqdq.CarrylessMultiply(X23, Y23, 0x10));
+                var Z67 =          Pclmulqdq.CarrylessMultiply(X23, Y23, 0x11);
+
+                var K01 =          Pclmulqdq.CarrylessMultiply(X03, Y03, 0x00);
+                var K12 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X03, Y03, 0x01),
+                                   Pclmulqdq.CarrylessMultiply(X03, Y03, 0x10));
+                var K23 =          Pclmulqdq.CarrylessMultiply(X03, Y03, 0x11);
+
+                K01 = Sse2.Xor(K01, Z01);
+                K12 = Sse2.Xor(K12, Z12);
+                K23 = Sse2.Xor(K23, Z23);
+
+                K01 = Sse2.Xor(K01, Z45);
+                K12 = Sse2.Xor(K12, Z56);
+                K23 = Sse2.Xor(K23, Z67);
+
+                Z23 = Sse2.Xor(Z23, K01);
+                Z45 = Sse2.Xor(Z45, K23);
+
+                zz[0] = Z01.GetElement(0);
+                zz[1] = Z01.GetElement(1) ^ Z12.GetElement(0);
+                zz[2] = Z23.GetElement(0) ^ Z12.GetElement(1);
+                zz[3] = Z23.GetElement(1) ^ K12.GetElement(0);
+                zz[4] = Z45.GetElement(0) ^ K12.GetElement(1);
+                zz[5] = Z45.GetElement(1) ^ Z56.GetElement(0);
+                zz[6] = Z67.GetElement(0) ^ Z56.GetElement(1);
+                zz[7] = Z67.GetElement(1);
+                return;
+            }
+#endif
+
             /*
              * "Two-level seven-way recursion" as described in "Batch binary Edwards", Daniel J. Bernstein.
              */
@@ -303,20 +351,6 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             Debug.Assert(x >> 60 == 0);
             Debug.Assert(y >> 60 == 0);
 
-#if NETCOREAPP3_0_OR_GREATER
-            if (Pclmulqdq.IsSupported)
-            {
-                var X = Vector128.CreateScalar(x);
-                var Y = Vector128.CreateScalar(y);
-                var Z = Pclmulqdq.CarrylessMultiply(X, Y, 0x00);
-                ulong z0 = Z.GetElement(0);
-                ulong z1 = Z.GetElement(1);
-                z[zOff    ] ^= z0 & M60;
-                z[zOff + 1] ^= (z0 >> 60) ^ (z1 << 4);
-                return;
-            }
-#endif
-
             //u[0] = 0;
             u[1] = y;
             u[2] = u[1] << 1;
diff --git a/crypto/src/math/ec/custom/sec/SecT283Field.cs b/crypto/src/math/ec/custom/sec/SecT283Field.cs
index 2ee96048f..334986452 100644
--- a/crypto/src/math/ec/custom/sec/SecT283Field.cs
+++ b/crypto/src/math/ec/custom/sec/SecT283Field.cs
@@ -249,6 +249,56 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
         protected static void ImplMultiply(ulong[] x, ulong[] y, ulong[] zz)
         {
+#if NETCOREAPP3_0_OR_GREATER
+            if (Pclmulqdq.IsSupported)
+            {
+                var X01 = Vector128.Create(x[0], x[1]);
+                var X23 = Vector128.Create(x[2], x[3]);
+                var X4_ = Vector128.CreateScalar(x[4]);
+                var Y01 = Vector128.Create(y[0], y[1]);
+                var Y23 = Vector128.Create(y[2], y[3]);
+                var Y4_ = Vector128.CreateScalar(y[4]);
+
+                var Z01 =          Pclmulqdq.CarrylessMultiply(X01, Y01, 0x00);
+                var Z12 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y01, 0x01),
+                                   Pclmulqdq.CarrylessMultiply(X01, Y01, 0x10));
+                var Z23 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y23, 0x00),
+                          Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y01, 0x11),
+                                   Pclmulqdq.CarrylessMultiply(X23, Y01, 0x00)));
+                var Z34 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y23, 0x01),
+                          Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y23, 0x10),
+                          Sse2.Xor(Pclmulqdq.CarrylessMultiply(X23, Y01, 0x01),
+                                   Pclmulqdq.CarrylessMultiply(X23, Y01, 0x10))));
+                var Z45 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y4_, 0x00),
+                          Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y23, 0x11),
+                          Sse2.Xor(Pclmulqdq.CarrylessMultiply(X23, Y23, 0x00),
+                          Sse2.Xor(Pclmulqdq.CarrylessMultiply(X23, Y01, 0x11),
+                                   Pclmulqdq.CarrylessMultiply(X4_, Y01, 0x00)))));
+                var Z56 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X01, Y4_, 0x01),
+                          Sse2.Xor(Pclmulqdq.CarrylessMultiply(X23, Y23, 0x01),
+                          Sse2.Xor(Pclmulqdq.CarrylessMultiply(X23, Y23, 0x10),
+                                   Pclmulqdq.CarrylessMultiply(X4_, Y01, 0x10))));
+                var Z67 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X23, Y4_, 0x00),
+                          Sse2.Xor(Pclmulqdq.CarrylessMultiply(X23, Y23, 0x11),
+                                   Pclmulqdq.CarrylessMultiply(X4_, Y23, 0x00)));
+                var Z78 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X23, Y4_, 0x01),
+                                   Pclmulqdq.CarrylessMultiply(X4_, Y23, 0x10));
+                var Z89 =          Pclmulqdq.CarrylessMultiply(X4_, Y4_, 0x00);
+
+                zz[0] = Z01.GetElement(0);
+                zz[1] = Z01.GetElement(1) ^ Z12.GetElement(0);
+                zz[2] = Z23.GetElement(0) ^ Z12.GetElement(1);
+                zz[3] = Z23.GetElement(1) ^ Z34.GetElement(0);
+                zz[4] = Z45.GetElement(0) ^ Z34.GetElement(1);
+                zz[5] = Z45.GetElement(1) ^ Z56.GetElement(0);
+                zz[6] = Z67.GetElement(0) ^ Z56.GetElement(1);
+                zz[7] = Z67.GetElement(1) ^ Z78.GetElement(0);
+                zz[8] = Z89.GetElement(0) ^ Z78.GetElement(1);
+                zz[9] = Z89.GetElement(1);
+                return;
+            }
+#endif
+
             /*
              * Formula (17) from "Some New Results on Binary Polynomial Multiplication",
              * Murat Cenk and M. Anwar Hasan.
@@ -377,20 +427,6 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             Debug.Assert(x >> 57 == 0);
             Debug.Assert(y >> 57 == 0);
 
-#if NETCOREAPP3_0_OR_GREATER
-            if (Pclmulqdq.IsSupported)
-            {
-                var X = Vector128.CreateScalar(x);
-                var Y = Vector128.CreateScalar(y);
-                var Z = Pclmulqdq.CarrylessMultiply(X, Y, 0x00);
-                ulong z0 = Z.GetElement(0);
-                ulong z1 = Z.GetElement(1);
-                z[zOff    ] = z0 & M57;
-                z[zOff + 1] = (z0 >> 57) ^ (z1 << 7);
-                return;
-            }
-#endif
-
             //u[0] = 0;
             u[1] = y;
             u[2] = u[1] << 1;