summary refs log tree commit diff
path: root/crypto/src/math/ec/custom/sec/SecT239Field.cs
diff options
context:
space:
mode:
author Peter Dettman <peter.dettman@bouncycastle.org> 2022-12-03 17:25:25 +0700
committer Peter Dettman <peter.dettman@bouncycastle.org> 2022-12-03 17:25:25 +0700
commit e576a376b11268ca954b95e1d7e451e283c2ba15 (patch)
tree 60ca57e1458c1b6c37a8ffdda3b0d42c551a4984 /crypto/src/math/ec/custom/sec/SecT239Field.cs
parent Missing file from commit (diff)
download BouncyCastle.NET-ed25519-e576a376b11268ca954b95e1d7e451e283c2ba15.tar.xz
Binary curve perf. opts.
Diffstat (limited to 'crypto/src/math/ec/custom/sec/SecT239Field.cs')
-rw-r--r-- crypto/src/math/ec/custom/sec/SecT239Field.cs 268
1 files changed, 234 insertions, 34 deletions
diff --git a/crypto/src/math/ec/custom/sec/SecT239Field.cs b/crypto/src/math/ec/custom/sec/SecT239Field.cs
index d48e1ec2b..b90867b76 100644
--- a/crypto/src/math/ec/custom/sec/SecT239Field.cs
+++ b/crypto/src/math/ec/custom/sec/SecT239Field.cs
@@ -11,12 +11,16 @@ using Org.BouncyCastle.Math.Raw;
 
 namespace Org.BouncyCastle.Math.EC.Custom.Sec
 {
-    internal class SecT239Field
+    internal static class SecT239Field
     {
         private const ulong M47 = ulong.MaxValue >> 17;
         private const ulong M60 = ulong.MaxValue >> 4;
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        public static void Add(ReadOnlySpan<ulong> x, ReadOnlySpan<ulong> y, Span<ulong> z)
+#else
         public static void Add(ulong[] x, ulong[] y, ulong[] z)
+#endif
         {
             z[0] = x[0] ^ y[0];
             z[1] = x[1] ^ y[1];
@@ -24,7 +28,23 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             z[3] = x[3] ^ y[3];
         }
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        public static void AddBothTo(ReadOnlySpan<ulong> x, ReadOnlySpan<ulong> y, Span<ulong> z)
+#else
+        public static void AddBothTo(ulong[] x, ulong[] y, ulong[] z)
+#endif
+        {
+            z[0] ^= x[0] ^ y[0];
+            z[1] ^= x[1] ^ y[1];
+            z[2] ^= x[2] ^ y[2];
+            z[3] ^= x[3] ^ y[3];
+        }
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        public static void AddExt(ReadOnlySpan<ulong> xx, ReadOnlySpan<ulong> yy, Span<ulong> zz)
+#else
         public static void AddExt(ulong[] xx, ulong[] yy, ulong[] zz)
+#endif
         {
             zz[0] = xx[0] ^ yy[0];
             zz[1] = xx[1] ^ yy[1];
@@ -36,7 +56,11 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             zz[7] = xx[7] ^ yy[7];
         }
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        public static void AddOne(ReadOnlySpan<ulong> x, Span<ulong> z)
+#else
         public static void AddOne(ulong[] x, ulong[] z)
+#endif
         {
             z[0] = x[0] ^ 1UL;
             z[1] = x[1];
@@ -44,7 +68,11 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             z[3] = x[3];
         }
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        public static void AddTo(ReadOnlySpan<ulong> x, Span<ulong> z)
+#else
         public static void AddTo(ulong[] x, ulong[] z)
+#endif
         {
             z[0] ^= x[0];
             z[1] ^= x[1];
@@ -57,9 +85,17 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             return Nat.FromBigInteger64(239, x);
         }
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        public static void HalfTrace(ReadOnlySpan<ulong> x, Span<ulong> z)
+#else
         public static void HalfTrace(ulong[] x, ulong[] z)
+#endif
         {
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+            Span<ulong> tt = stackalloc ulong[8];
+#else
             ulong[] tt = Nat256.CreateExt64();
+#endif
 
             Nat256.Copy64(x, z);
             for (int i = 1; i < 239; i += 2)
@@ -72,15 +108,24 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             }
         }
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        public static void Invert(ReadOnlySpan<ulong> x, Span<ulong> z)
+#else
         public static void Invert(ulong[] x, ulong[] z)
+#endif
         {
             if (Nat256.IsZero64(x))
                 throw new InvalidOperationException();
 
             // Itoh-Tsujii inversion
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+            Span<ulong> t0 = stackalloc ulong[4];
+            Span<ulong> t1 = stackalloc ulong[4];
+#else
             ulong[] t0 = Nat256.Create64();
             ulong[] t1 = Nat256.Create64();
+#endif
 
             Square(x, t0);
             Multiply(t0, x, t0);
@@ -109,21 +154,55 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             Square(t0, z);
         }
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        public static void Multiply(ReadOnlySpan<ulong> x, ReadOnlySpan<ulong> y, Span<ulong> z)
+#else
         public static void Multiply(ulong[] x, ulong[] y, ulong[] z)
+#endif
         {
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+            Span<ulong> tt = stackalloc ulong[8];
+#else
             ulong[] tt = Nat256.CreateExt64();
+#endif
             ImplMultiply(x, y, tt);
             Reduce(tt, z);
         }
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        public static void MultiplyAddToExt(ReadOnlySpan<ulong> x, ReadOnlySpan<ulong> y, Span<ulong> zz)
+#else
         public static void MultiplyAddToExt(ulong[] x, ulong[] y, ulong[] zz)
+#endif
         {
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+            Span<ulong> tt = stackalloc ulong[8];
+#else
             ulong[] tt = Nat256.CreateExt64();
+#endif
             ImplMultiply(x, y, tt);
             AddExt(zz, tt, zz);
         }
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        public static void MultiplyExt(ReadOnlySpan<ulong> x, ReadOnlySpan<ulong> y, Span<ulong> zz)
+#else
+        public static void MultiplyExt(ulong[] x, ulong[] y, ulong[] zz)
+#endif
+        {
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+            zz[..8].Fill(0UL);
+#else
+            Array.Clear(zz, 0, 8);
+#endif
+            ImplMultiply(x, y, zz);
+        }
+
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        public static void Reduce(ReadOnlySpan<ulong> xx, Span<ulong> z)
+#else
         public static void Reduce(ulong[] xx, ulong[] z)
+#endif
         {
             ulong x0 = xx[0], x1 = xx[1], x2 = xx[2], x3 = xx[3];
             ulong x4 = xx[4], x5 = xx[5], x6 = xx[6], x7 = xx[7];
@@ -163,7 +242,11 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             z[zOff + 3]  = z3 & M47;
         }
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        public static void Sqrt(ReadOnlySpan<ulong> x, Span<ulong> z)
+#else
         public static void Sqrt(ulong[] x, ulong[] z)
+#endif
         {
             ulong c0 = Interleave.Unshuffle(x[0], x[1], out ulong e0);
             ulong c1 = Interleave.Unshuffle(x[2], x[3], out ulong e1);
@@ -173,9 +256,14 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             c2  = (c0 >> 49) | (c1 << 15);
             c1 ^=              (c0 << 15);
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+            Span<ulong> tt = stackalloc ulong[8];
+            Span<int> shifts = stackalloc int[] { 39, 120 };
+#else
             ulong[] tt = Nat256.CreateExt64();
-
             int[] shifts = { 39, 120 };
+#endif
+
             for (int i = 0; i < shifts.Length; ++i)
             {
                 int w = shifts[i] >> 6, s = shifts[i] & 63;
@@ -193,30 +281,58 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             z[1] ^= e1;
         }
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        public static void Square(ReadOnlySpan<ulong> x, Span<ulong> z)
+#else
         public static void Square(ulong[] x, ulong[] z)
+#endif
         {
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+            Span<ulong> tt = stackalloc ulong[8];
+#else
             ulong[] tt = Nat256.CreateExt64();
+#endif
             ImplSquare(x, tt);
             Reduce(tt, z);
         }
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        public static void SquareAddToExt(ReadOnlySpan<ulong> x, Span<ulong> zz)
+#else
         public static void SquareAddToExt(ulong[] x, ulong[] zz)
+#endif
         {
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+            Span<ulong> tt = stackalloc ulong[8];
+#else
             ulong[] tt = Nat256.CreateExt64();
+#endif
             ImplSquare(x, tt);
             AddExt(zz, tt, zz);
         }
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        public static void SquareExt(ReadOnlySpan<ulong> x, Span<ulong> zz)
+#else
         public static void SquareExt(ulong[] x, ulong[] zz)
+#endif
         {
             ImplSquare(x, zz);
         }
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        public static void SquareN(ReadOnlySpan<ulong> x, int n, Span<ulong> z)
+#else
         public static void SquareN(ulong[] x, int n, ulong[] z)
+#endif
         {
             Debug.Assert(n > 0);
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+            Span<ulong> tt = stackalloc ulong[8];
+#else
             ulong[] tt = Nat256.CreateExt64();
+#endif
             ImplSquare(x, tt);
             Reduce(tt, z);
 
@@ -227,13 +343,21 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             }
         }
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        public static uint Trace(ReadOnlySpan<ulong> x)
+#else
         public static uint Trace(ulong[] x)
+#endif
         {
             // Non-zero-trace bits: 0, 81, 162
             return (uint)(x[0] ^ (x[1] >> 17) ^ (x[2] >> 34)) & 1U;
         }
 
-        protected static void ImplCompactExt(ulong[] zz)
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        private static void ImplCompactExt(Span<ulong> zz)
+#else
+        private static void ImplCompactExt(ulong[] zz)
+#endif
         {
             ulong z0 = zz[0], z1 = zz[1], z2 = zz[2], z3 = zz[3], z4 = zz[4], z5 = zz[5], z6 = zz[6], z7 = zz[7];
             zz[0] =  z0        ^ (z1 << 60);
@@ -246,7 +370,11 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             zz[7] = (z7 >> 28);
         }
 
-        protected static void ImplExpand(ulong[] x, ulong[] z)
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        private static void ImplExpand(ReadOnlySpan<ulong> x, Span<ulong> z)
+#else
+        private static void ImplExpand(ulong[] x, ulong[] z)
+#endif
         {
             ulong x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3];
             z[0] = x0 & M60;
@@ -255,10 +383,11 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             z[3] = ((x2 >> 52) ^ (x3 << 12));
         }
 
-        protected static void ImplMultiply(ulong[] x, ulong[] y, ulong[] zz)
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        private static void ImplMultiply(ReadOnlySpan<ulong> x, ReadOnlySpan<ulong> y, Span<ulong> zz)
         {
 #if NETCOREAPP3_0_OR_GREATER
-            if (Pclmulqdq.IsSupported)
+            if (Pclmulqdq.IsSupported && BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<ulong>>() == 16)
             {
                 var X01 = Vector128.Create(x[0], x[1]);
                 var X23 = Vector128.Create(x[2], x[3]);
@@ -272,35 +401,37 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
                                    Pclmulqdq.CarrylessMultiply(X01, Y01, 0x10));
                 var Z23 =          Pclmulqdq.CarrylessMultiply(X01, Y01, 0x11);
 
+                Z01 = Sse2.Xor(Z01, Sse2.ShiftLeftLogical128BitLane (Z12, 8));
+                Z23 = Sse2.Xor(Z23, Sse2.ShiftRightLogical128BitLane(Z12, 8));
+
                 var Z45 =          Pclmulqdq.CarrylessMultiply(X23, Y23, 0x00);
                 var Z56 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X23, Y23, 0x01),
                                    Pclmulqdq.CarrylessMultiply(X23, Y23, 0x10));
                 var Z67 =          Pclmulqdq.CarrylessMultiply(X23, Y23, 0x11);
 
+                Z45 = Sse2.Xor(Z45, Sse2.ShiftLeftLogical128BitLane (Z56, 8));
+                Z67 = Sse2.Xor(Z67, Sse2.ShiftRightLogical128BitLane(Z56, 8));
+
                 var K01 =          Pclmulqdq.CarrylessMultiply(X03, Y03, 0x00);
                 var K12 = Sse2.Xor(Pclmulqdq.CarrylessMultiply(X03, Y03, 0x01),
                                    Pclmulqdq.CarrylessMultiply(X03, Y03, 0x10));
                 var K23 =          Pclmulqdq.CarrylessMultiply(X03, Y03, 0x11);
 
-                K01 = Sse2.Xor(K01, Z01);
-                K12 = Sse2.Xor(K12, Z12);
-                K23 = Sse2.Xor(K23, Z23);
+                var T = Sse2.Xor(Z23, Z45);
 
-                K01 = Sse2.Xor(K01, Z45);
-                K12 = Sse2.Xor(K12, Z56);
-                K23 = Sse2.Xor(K23, Z67);
+                Z23 = Sse2.Xor(T, Z01);
+                Z45 = Sse2.Xor(T, Z67);
 
                 Z23 = Sse2.Xor(Z23, K01);
+                Z23 = Sse2.Xor(Z23, Sse2.ShiftLeftLogical128BitLane (K12, 8));
+                Z45 = Sse2.Xor(Z45, Sse2.ShiftRightLogical128BitLane(K12, 8));
                 Z45 = Sse2.Xor(Z45, K23);
 
-                zz[0] = Z01.GetElement(0);
-                zz[1] = Z01.GetElement(1) ^ Z12.GetElement(0);
-                zz[2] = Z23.GetElement(0) ^ Z12.GetElement(1);
-                zz[3] = Z23.GetElement(1) ^ K12.GetElement(0);
-                zz[4] = Z45.GetElement(0) ^ K12.GetElement(1);
-                zz[5] = Z45.GetElement(1) ^ Z56.GetElement(0);
-                zz[6] = Z67.GetElement(0) ^ Z56.GetElement(1);
-                zz[7] = Z67.GetElement(1);
+                Span<byte> zzBytes = MemoryMarshal.AsBytes(zz);
+                MemoryMarshal.Write(zzBytes[0x00..0x10], ref Z01);
+                MemoryMarshal.Write(zzBytes[0x10..0x20], ref Z23);
+                MemoryMarshal.Write(zzBytes[0x20..0x30], ref Z45);
+                MemoryMarshal.Write(zzBytes[0x30..0x40], ref Z67);
                 return;
             }
 #endif
@@ -309,6 +440,56 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
              * "Two-level seven-way recursion" as described in "Batch binary Edwards", Daniel J. Bernstein.
              */
 
+            Span<ulong> f = stackalloc ulong[4], g = stackalloc ulong[4];
+            ImplExpand(x, f);
+            ImplExpand(y, g);
+
+            Span<ulong> u = stackalloc ulong[8];
+
+            ImplMulwAcc(u, f[0], g[0], zz[0..]);
+            ImplMulwAcc(u, f[1], g[1], zz[1..]);
+            ImplMulwAcc(u, f[2], g[2], zz[2..]);
+            ImplMulwAcc(u, f[3], g[3], zz[3..]);
+
+            // U *= (1 - t^n)
+            for (int i = 5; i > 0; --i)
+            {
+                zz[i] ^= zz[i - 1];
+            }
+
+            ImplMulwAcc(u, f[0] ^ f[1], g[0] ^ g[1], zz[1..]);
+            ImplMulwAcc(u, f[2] ^ f[3], g[2] ^ g[3], zz[3..]);
+
+            // V *= (1 - t^2n)
+            for (int i = 7; i > 1; --i)
+            {
+                zz[i] ^= zz[i - 2];
+            }
+
+            // Double-length recursion
+            {
+                ulong c0 = f[0] ^ f[2], c1 = f[1] ^ f[3];
+                ulong d0 = g[0] ^ g[2], d1 = g[1] ^ g[3];
+                ImplMulwAcc(u, c0 ^ c1, d0 ^ d1, zz[3..]);
+                Span<ulong> t = stackalloc ulong[3];
+                ImplMulwAcc(u, c0, d0, t[0..]);
+                ImplMulwAcc(u, c1, d1, t[1..]);
+                ulong t0 = t[0], t1 = t[1], t2 = t[2];
+                zz[2] ^= t0;
+                zz[3] ^= t0 ^ t1;
+                zz[4] ^= t2 ^ t1;
+                zz[5] ^= t2;
+            }
+
+            ImplCompactExt(zz);
+        }
+#else
+        private static void ImplMultiply(ulong[] x, ulong[] y, ulong[] zz)
+        {
+            /*
+             * "Two-level seven-way recursion" as described in "Batch binary Edwards", Daniel J. Bernstein.
+             */
+
             ulong[] f = new ulong[4], g = new ulong[4];
             ImplExpand(x, f);
             ImplExpand(y, g);
@@ -352,8 +533,13 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
             ImplCompactExt(zz);
         }
+#endif
 
-        protected static void ImplMulwAcc(ulong[] u, ulong x, ulong y, ulong[] z, int zOff)
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        private static void ImplMulwAcc(Span<ulong> u, ulong x, ulong y, Span<ulong> z)
+#else
+        private static void ImplMulwAcc(ulong[] u, ulong x, ulong y, ulong[] z, int zOff)
+#endif
         {
             Debug.Assert(x >> 60 == 0);
             Debug.Assert(y >> 60 == 0);
@@ -368,14 +554,14 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
             u[7] = u[6] ^  y;
 
             uint j = (uint)x;
-            ulong g, h = 0, l = u[j & 7]
-                              ^ (u[(j >> 3) & 7] << 3);
+            ulong g, h = 0, l = u[(int)j & 7]
+                              ^ (u[(int)(j >> 3) & 7] << 3);
             int k = 54;
             do
             {
                 j  = (uint)(x >> k);
-                g  = u[j & 7]
-                   ^ u[(j >> 3) & 7] << 3;
+                g  = u[(int)j & 7]
+                   ^ u[(int)(j >> 3) & 7] << 3;
                 l ^= (g <<  k);
                 h ^= (g >> -k);
             }
@@ -385,28 +571,42 @@ namespace Org.BouncyCastle.Math.EC.Custom.Sec
 
             Debug.Assert(h >> 55 == 0);
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+            z[0] ^= l & M60;
+            z[1] ^= (l >> 60) ^ (h << 4);
+#else
             z[zOff    ] ^= l & M60;
             z[zOff + 1] ^= (l >> 60) ^ (h << 4);
+#endif
         }
 
-        protected static void ImplSquare(ulong[] x, ulong[] zz)
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        private static void ImplSquare(ReadOnlySpan<ulong> x, Span<ulong> zz)
+#else
+        private static void ImplSquare(ulong[] x, ulong[] zz)
+#endif
         {
 #if NETCOREAPP3_0_OR_GREATER
             if (Bmi2.X64.IsSupported)
             {
-                zz[7] = Bmi2.X64.ParallelBitDeposit(x[3] >> 32, 0x5555555555555555UL);
-                zz[6] = Bmi2.X64.ParallelBitDeposit(x[3]      , 0x5555555555555555UL);
-                zz[5] = Bmi2.X64.ParallelBitDeposit(x[2] >> 32, 0x5555555555555555UL);
-                zz[4] = Bmi2.X64.ParallelBitDeposit(x[2]      , 0x5555555555555555UL);
-                zz[3] = Bmi2.X64.ParallelBitDeposit(x[1] >> 32, 0x5555555555555555UL);
-                zz[2] = Bmi2.X64.ParallelBitDeposit(x[1]      , 0x5555555555555555UL);
-                zz[1] = Bmi2.X64.ParallelBitDeposit(x[0] >> 32, 0x5555555555555555UL);
-                zz[0] = Bmi2.X64.ParallelBitDeposit(x[0]      , 0x5555555555555555UL);
+                ulong x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3];
+                zz[7] = Bmi2.X64.ParallelBitDeposit(x3 >> 32, 0x5555555555555555UL);
+                zz[6] = Bmi2.X64.ParallelBitDeposit(x3      , 0x5555555555555555UL);
+                zz[5] = Bmi2.X64.ParallelBitDeposit(x2 >> 32, 0x5555555555555555UL);
+                zz[4] = Bmi2.X64.ParallelBitDeposit(x2      , 0x5555555555555555UL);
+                zz[3] = Bmi2.X64.ParallelBitDeposit(x1 >> 32, 0x5555555555555555UL);
+                zz[2] = Bmi2.X64.ParallelBitDeposit(x1      , 0x5555555555555555UL);
+                zz[1] = Bmi2.X64.ParallelBitDeposit(x0 >> 32, 0x5555555555555555UL);
+                zz[0] = Bmi2.X64.ParallelBitDeposit(x0      , 0x5555555555555555UL);
                 return;
             }
 #endif
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+            Interleave.Expand64To128(x[..4], zz[..8]);
+#else
             Interleave.Expand64To128(x, 0, 4, zz, 0);
+#endif
         }
     }
 }