diff options
author | Peter Dettman <peter.dettman@bouncycastle.org> | 2018-04-17 10:41:26 +0700 |
---|---|---|
committer | Peter Dettman <peter.dettman@bouncycastle.org> | 2018-04-17 10:41:26 +0700 |
commit | bd1f720cafbcbf30891a286718ee91c44e1c8a4e (patch) | |
tree | 5b5a25fd08f0755ef23fb55b61162b45dfed2b42 /crypto/src/math | |
parent | Cache-safety for EC lookup tables (diff) | |
download | BouncyCastle.NET-ed25519-bd1f720cafbcbf30891a286718ee91c44e1c8a4e.tar.xz |
Add X25519 and X448 from RFC 7748
- includes optimized ladders for base points
Diffstat (limited to 'crypto/src/math')
-rw-r--r-- | crypto/src/math/ec/rfc7748/X25519.cs | 249 | ||||
-rw-r--r-- | crypto/src/math/ec/rfc7748/X25519Field.cs | 520 | ||||
-rw-r--r-- | crypto/src/math/ec/rfc7748/X448.cs | 255 | ||||
-rw-r--r-- | crypto/src/math/ec/rfc7748/X448Field.cs | 904 |
4 files changed, 1928 insertions, 0 deletions
diff --git a/crypto/src/math/ec/rfc7748/X25519.cs b/crypto/src/math/ec/rfc7748/X25519.cs new file mode 100644 index 000000000..862b8e0d0 --- /dev/null +++ b/crypto/src/math/ec/rfc7748/X25519.cs @@ -0,0 +1,249 @@ +using System; +using System.Diagnostics; +using System.Runtime.CompilerServices; + +namespace Org.BouncyCastle.Math.EC.Rfc7748 +{ + public abstract class X25519 + { + private const int C_A = 486662; + private const int C_A24 = (C_A + 2)/4; + + // 0x1 + //private static readonly int[] S_x = new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + // 0x215132111D8354CB52385F46DCA2B71D440F6A51EB4D1207816B1E0137D48290 + private static readonly int[] PsubS_x = new int[]{ 0x03D48290, 0x02C7804D, 0x01207816, 0x028F5A68, 0x00881ED4, 0x00A2B71D, + 0x0217D1B7, 0x014CB523, 0x0088EC1A, 0x0042A264 }; + + private static int[] precompBase = null; + + private static uint Decode32(byte[] bs, int off) + { + uint n = bs[off]; + n |= (uint)bs[++off] << 8; + n |= (uint)bs[++off] << 16; + n |= (uint)bs[++off] << 24; + return n; + } + + private static void DecodeScalar(byte[] k, int kOff, uint[] n) + { + for (int i = 0; i < 8; ++i) + { + n[i] = Decode32(k, kOff + i * 4); + } + + n[0] &= 0xFFFFFFF8U; + n[7] &= 0x7FFFFFFFU; + n[7] |= 0x40000000U; + } + + private static void PointDouble(int[] x, int[] z) + { + int[] A = X25519Field.Create(); + int[] B = X25519Field.Create(); + + X25519Field.Apm(x, z, A, B); + X25519Field.Sqr(A, A); + X25519Field.Sqr(B, B); + X25519Field.Mul(A, B, x); + X25519Field.Sub(A, B, A); + X25519Field.Mul(A, C_A24, z); + X25519Field.Add(z, B, z); + X25519Field.Mul(z, A, z); + } + + [MethodImpl(MethodImplOptions.Synchronized)] + public static void Precompute() + { + if (precompBase != null) + return; + + precompBase = new int[X25519Field.Size * 252]; + + int[] xs = precompBase; + int[] zs = new int[X25519Field.Size * 251]; + + int[] x = X25519Field.Create(); x[0] = 9; + int[] z = X25519Field.Create(); z[0] = 1; + + int[] n = X25519Field.Create(); + int[] d = X25519Field.Create(); + + X25519Field.Apm(x, z, n, d); + + int[] c = X25519Field.Create(); X25519Field.Copy(d, 0, c, 0); + + int off = 0; + for (;;) + { + X25519Field.Copy(n, 0, xs, off); + + if (off == (X25519Field.Size * 251)) + break; + + PointDouble(x, z); + + X25519Field.Apm(x, z, n, d); + X25519Field.Mul(n, c, n); + X25519Field.Mul(c, d, c); + + X25519Field.Copy(d, 0, zs, off); + + off += X25519Field.Size; + } + + int[] u = X25519Field.Create(); + X25519Field.Inv(c, u); + + for (;;) + { + X25519Field.Copy(xs, off, x, 0); + + X25519Field.Mul(x, u, x); + //X25519Field.Normalize(x); + X25519Field.Copy(x, 0, precompBase, off); + + if (off == 0) + break; + + off -= X25519Field.Size; + X25519Field.Copy(zs, off, z, 0); + X25519Field.Mul(u, z, u); + } + } + + public static void ScalarMult(byte[] k, int kOff, byte[] u, int uOff, byte[] r, int rOff) + { + uint[] n = new uint[8]; DecodeScalar(k, kOff, n); + + int[] x1 = X25519Field.Create(); X25519Field.Decode(u, uOff, x1); + int[] x2 = X25519Field.Create(); X25519Field.Copy(x1, 0, x2, 0); + int[] z2 = X25519Field.Create(); z2[0] = 1; + int[] x3 = X25519Field.Create(); x3[0] = 1; + int[] z3 = X25519Field.Create(); + + int[] t1 = X25519Field.Create(); + int[] t2 = X25519Field.Create(); + + Debug.Assert(n[7] >> 30 == 1U); + + int bit = 254, swap = 1; + do + { + X25519Field.Apm(x3, z3, t1, x3); + X25519Field.Apm(x2, z2, z3, x2); + X25519Field.Mul(t1, x2, t1); + X25519Field.Mul(x3, z3, x3); + X25519Field.Sqr(z3, z3); + X25519Field.Sqr(x2, x2); + + X25519Field.Sub(z3, x2, t2); + X25519Field.Mul(t2, C_A24, z2); + X25519Field.Add(z2, x2, z2); + X25519Field.Mul(z2, t2, z2); + X25519Field.Mul(x2, z3, x2); + + X25519Field.Apm(t1, x3, x3, z3); + X25519Field.Sqr(x3, x3); + X25519Field.Sqr(z3, z3); + X25519Field.Mul(z3, x1, z3); + + --bit; + + int word = bit >> 5, shift = bit & 0x1F; + int kt = (int)(n[word] >> shift) & 1; + swap ^= kt; + X25519Field.CSwap(swap, x2, x3); + X25519Field.CSwap(swap, z2, z3); + swap = kt; + } + while (bit >= 3); + + Debug.Assert(swap == 0); + + for (int i = 0; i < 3; ++i) + { + PointDouble(x2, z2); + } + + X25519Field.Inv(z2, z2); + X25519Field.Mul(x2, z2, x2); + + X25519Field.Normalize(x2); + X25519Field.Encode(x2, r, rOff); + } + + private static void Dump(string label, int[] x) + { + int[] y = X25519Field.Create(); + X25519Field.Copy(x, 0, y, 0); + X25519Field.Normalize(y); + + byte[] r = new byte[32]; + X25519Field.Encode(y, r, 0); + string val = Org.BouncyCastle.Utilities.Encoders.Hex.ToHexString(r); + Console.WriteLine(" " + label + ": " + val); + } + + public static void ScalarMultBase(byte[] k, int kOff, byte[] r, int rOff) + { + Precompute(); + + uint[] n = new uint[8]; DecodeScalar(k, kOff, n); + + int[] x0 = X25519Field.Create(); + //int[] x1 = X25519Field.Create(); X25519Field.Copy(S_x, 0, x1, 0); + int[] x1 = X25519Field.Create(); x1[0] = 1; + int[] z1 = X25519Field.Create(); z1[0] = 1; + int[] x2 = X25519Field.Create(); X25519Field.Copy(PsubS_x, 0, x2, 0); + int[] z2 = X25519Field.Create(); z2[0] = 1; + + int[] A = x1; + int[] B = z1; + int[] C = x0; + int[] D = A; + int[] E = B; + + Debug.Assert(n[7] >> 30 == 1U); + + int off = 0, bit = 3, swap = 1; + do + { + X25519Field.Copy(precompBase, off, x0, 0); + off += X25519Field.Size; + + int word = bit >> 5, shift = bit & 0x1F; + int kt = (int)(n[word] >> shift) & 1; + swap ^= kt; + X25519Field.CSwap(swap, x1, x2); + X25519Field.CSwap(swap, z1, z2); + swap = kt; + + X25519Field.Apm(x1, z1, A, B); + X25519Field.Mul(x0, B, C); + X25519Field.Carry(A); + X25519Field.Apm(A, C, D, E); + X25519Field.Sqr(D, D); + X25519Field.Sqr(E, E); + X25519Field.Mul(z2, D, x1); + X25519Field.Mul(x2, E, z1); + } + while (++bit < 255); + + Debug.Assert(swap == 1); + + for (int i = 0; i < 3; ++i) + { + PointDouble(x1, z1); + } + + X25519Field.Inv(z1, z1); + X25519Field.Mul(x1, z1, x1); + + X25519Field.Normalize(x1); + X25519Field.Encode(x1, r, rOff); + } + } +} diff --git a/crypto/src/math/ec/rfc7748/X25519Field.cs b/crypto/src/math/ec/rfc7748/X25519Field.cs new file mode 100644 index 000000000..282f41628 --- /dev/null +++ b/crypto/src/math/ec/rfc7748/X25519Field.cs @@ -0,0 +1,520 @@ +using System; +using System.Diagnostics; + +namespace Org.BouncyCastle.Math.EC.Rfc7748 +{ + public abstract class X25519Field + { + public const int Size = 10; + + private const int M24 = 0x00FFFFFF; + private const int M25 = 0x01FFFFFF; + private const int M26 = 0x03FFFFFF; + + private X25519Field() {} + + public static void Add(int[] x, int[] y, int[] z) + { + for (int i = 0; i < Size; ++i) + { + z[i] = x[i] + y[i]; + } + } + + public static void Apm(int[] x, int[] y, int[] zp, int[] zm) + { + for (int i = 0; i < Size; ++i) + { + int xi = x[i], yi = y[i]; + zp[i] = xi + yi; + zm[i] = xi - yi; + } + } + + public static void Carry(int[] z) + { + int z0 = z[0], z1 = z[1], z2 = z[2], z3 = z[3], z4 = z[4]; + int z5 = z[5], z6 = z[6], z7 = z[7], z8 = z[8], z9 = z[9]; + + z3 += (z2 >> 25); z2 &= M25; + z5 += (z4 >> 25); z4 &= M25; + z8 += (z7 >> 25); z7 &= M25; + //z0 += (z9 >> 24) * 19; z9 &= M24; + z0 += (z9 >> 25) * 38; z9 &= M25; + + z1 += (z0 >> 26); z0 &= M26; + z6 += (z5 >> 26); z5 &= M26; + + z2 += (z1 >> 26); z1 &= M26; + z4 += (z3 >> 26); z3 &= M26; + z7 += (z6 >> 26); z6 &= M26; + z9 += (z8 >> 26); z8 &= M26; + + z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4; + z[5] = z5; z[6] = z6; z[7] = z7; z[8] = z8; z[9] = z9; + } + + public static void Copy(int[] x, int xOff, int[] z, int zOff) + { + for (int i = 0; i < Size; ++i) + { + z[zOff + i] = x[xOff + i]; + } + } + + public static int[] Create() + { + return new int[Size]; + } + + public static void CSwap(int swap, int[] a, int[] b) + { + Debug.Assert(swap >> 1 == 0); + Debug.Assert(a != b); + + int mask = 0 - swap; + for (int i = 0; i < Size; ++i) + { + int ai = a[i], bi = b[i]; + int dummy = mask & (ai ^ bi); + a[i] = ai ^ dummy; + b[i] = bi ^ dummy; + } + } + + public static void Decode(byte[] x, int xOff, int[] z) + { + Decode128(x, xOff, z, 0); + Decode128(x, xOff + 16, z, 5); + z[9] &= M24; + } + + private static void Decode128(byte[] bs, int off, int[] z, int zOff) + { + uint t0 = Decode32(bs, off + 0); + uint t1 = Decode32(bs, off + 4); + uint t2 = Decode32(bs, off + 8); + uint t3 = Decode32(bs, off + 12); + + z[zOff + 0] = (int)t0 & M26; + z[zOff + 1] = (int)((t1 << 6) | (t0 >> 26)) & M26; + z[zOff + 2] = (int)((t2 << 12) | (t1 >> 20)) & M25; + z[zOff + 3] = (int)((t3 << 19) | (t2 >> 13)) & M26; + z[zOff + 4] = (int)(t3 >> 7); + } + + private static uint Decode32(byte[] bs, int off) + { + uint n = bs[off]; + n |= (uint)bs[++off] << 8; + n |= (uint)bs[++off] << 16; + n |= (uint)bs[++off] << 24; + return n; + } + + public static void Encode(int[] x, byte[] z, int zOff) + { + Encode128(x, 0, z, zOff); + Encode128(x, 5, z, zOff + 16); + } + + private static void Encode128(int[] x, int xOff, byte[] bs, int off) + { + uint x0 = (uint)x[xOff + 0], x1 = (uint)x[xOff + 1], x2 = (uint)x[xOff + 2]; + uint x3 = (uint)x[xOff + 3], x4 = (uint)x[xOff + 4]; + + uint t0 = x0 | (x1 << 26); Encode32(t0, bs, off + 0); + uint t1 = (x1 >> 6) | (x2 << 20); Encode32(t1, bs, off + 4); + uint t2 = (x2 >> 12) | (x3 << 13); Encode32(t2, bs, off + 8); + uint t3 = (x3 >> 19) | (x4 << 7); Encode32(t3, bs, off + 12); + } + + private static void Encode32(uint n, byte[] bs, int off) + { + bs[ off] = (byte)(n ); + bs[++off] = (byte)(n >> 8); + bs[++off] = (byte)(n >> 16); + bs[++off] = (byte)(n >> 24); + } + + public static void Inv(int[] x, int[] z) + { + // z = x^(p-2) = x^7FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEB + // (250 1s) (1 0s) (1 1s) (1 0s) (2 1s) + // Addition chain: [1] [2] 3 5 10 15 25 50 75 125 [250] + + int[] x2 = Create(); Sqr(x, x2); Mul(x, x2, x2); + int[] x3 = Create(); Sqr(x2, x3); Mul(x, x3, x3); + int[] x5 = x3; Sqr(x3, 2, x5); Mul(x2, x5, x5); + int[] x10 = Create(); Sqr(x5, 5, x10); Mul(x5, x10, x10); + int[] x15 = Create(); Sqr(x10, 5, x15); Mul(x5, x15, x15); + int[] x25 = x5; Sqr(x15, 10, x25); Mul(x10, x25, x25); + int[] x50 = x10; Sqr(x25, 25, x50); Mul(x25, x50, x50); + int[] x75 = x15; Sqr(x50, 25, x75); Mul(x25, x75, x75); + int[] x125 = x25; Sqr(x75, 50, x125); Mul(x50, x125, x125); + int[] x250 = x50; Sqr(x125, 125, x250); Mul(x125, x250, x250); + + int[] t = x125; + Sqr(x250, 2, t); + Mul(t, x, t); + Sqr(t, 3, t); + Mul(t, x2, z); + } + + public static void Mul(int[] x, int y, int[] z) + { + int x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4]; + int x5 = x[5], x6 = x[6], x7 = x[7], x8 = x[8], x9 = x[9]; + long c0, c1, c2, c3; + + c0 = (long)x2 * y; x2 = (int)c0 & M25; c0 >>= 25; + c1 = (long)x4 * y; x4 = (int)c1 & M25; c1 >>= 25; + c2 = (long)x7 * y; x7 = (int)c2 & M25; c2 >>= 25; + //c3 = (long)x9 * y; x9 = (int)c3 & M24; c3 >>= 24; + //c3 *= 19; + c3 = (long)x9 * y; x9 = (int)c3 & M25; c3 >>= 25; + c3 *= 38; + + c3 += (long)x0 * y; z[0] = (int)c3 & M26; c3 >>= 26; + c1 += (long)x5 * y; z[5] = (int)c1 & M26; c1 >>= 26; + + c3 += (long)x1 * y; z[1] = (int)c3 & M26; c3 >>= 26; + c0 += (long)x3 * y; z[3] = (int)c0 & M26; c0 >>= 26; + c1 += (long)x6 * y; z[6] = (int)c1 & M26; c1 >>= 26; + c2 += (long)x8 * y; z[8] = (int)c2 & M26; c2 >>= 26; + + z[2] = x2 + (int)c3; + z[4] = x4 + (int)c0; + z[7] = x7 + (int)c1; + z[9] = x9 + (int)c2; + } + + public static void Mul(int[] x, int[] y, int[] z) + { + int x0 = x[0], y0 = y[0]; + int x1 = x[1], y1 = y[1]; + int x2 = x[2], y2 = y[2]; + int x3 = x[3], y3 = y[3]; + int x4 = x[4], y4 = y[4]; + + int u0 = x[5], v0 = y[5]; + int u1 = x[6], v1 = y[6]; + int u2 = x[7], v2 = y[7]; + int u3 = x[8], v3 = y[8]; + int u4 = x[9], v4 = y[9]; + + long a0 = (long)x0 * y0; + long a1 = (long)x0 * y1 + + (long)x1 * y0; + long a2 = (long)x0 * y2 + + (long)x1 * y1 + + (long)x2 * y0; + long a3 = (long)x1 * y2 + + (long)x2 * y1; + a3 <<= 1; + a3 += (long)x0 * y3 + + (long)x3 * y0; + long a4 = (long)x2 * y2; + a4 <<= 1; + a4 += (long)x0 * y4 + + (long)x1 * y3 + + (long)x3 * y1 + + (long)x4 * y0; + long a5 = (long)x1 * y4 + + (long)x2 * y3 + + (long)x3 * y2 + + (long)x4 * y1; + a5 <<= 1; + long a6 = (long)x2 * y4 + + (long)x4 * y2; + a6 <<= 1; + a6 += (long)x3 * y3; + long a7 = (long)x3 * y4 + + (long)x4 * y3; + long a8 = (long)x4 * y4; + a8 <<= 1; + + long b0 = (long)u0 * v0; + long b1 = (long)u0 * v1 + + (long)u1 * v0; + long b2 = (long)u0 * v2 + + (long)u1 * v1 + + (long)u2 * v0; + long b3 = (long)u1 * v2 + + (long)u2 * v1; + b3 <<= 1; + b3 += (long)u0 * v3 + + (long)u3 * v0; + long b4 = (long)u2 * v2; + b4 <<= 1; + b4 += (long)u0 * v4 + + (long)u1 * v3 + + (long)u3 * v1 + + (long)u4 * v0; + long b5 = (long)u1 * v4 + + (long)u2 * v3 + + (long)u3 * v2 + + (long)u4 * v1; + //b5 <<= 1; + long b6 = (long)u2 * v4 + + (long)u4 * v2; + b6 <<= 1; + b6 += (long)u3 * v3; + long b7 = (long)u3 * v4 + + (long)u4 * v3; + long b8 = (long)u4 * v4; + //b8 <<= 1; + + a0 -= b5 * 76; + a1 -= b6 * 38; + a2 -= b7 * 38; + a3 -= b8 * 76; + + a5 -= b0; + a6 -= b1; + a7 -= b2; + a8 -= b3; + //long a9 = -b4; + + x0 += u0; y0 += v0; + x1 += u1; y1 += v1; + x2 += u2; y2 += v2; + x3 += u3; y3 += v3; + x4 += u4; y4 += v4; + + long c0 = (long)x0 * y0; + long c1 = (long)x0 * y1 + + (long)x1 * y0; + long c2 = (long)x0 * y2 + + (long)x1 * y1 + + (long)x2 * y0; + long c3 = (long)x1 * y2 + + (long)x2 * y1; + c3 <<= 1; + c3 += (long)x0 * y3 + + (long)x3 * y0; + long c4 = (long)x2 * y2; + c4 <<= 1; + c4 += (long)x0 * y4 + + (long)x1 * y3 + + (long)x3 * y1 + + (long)x4 * y0; + long c5 = (long)x1 * y4 + + (long)x2 * y3 + + (long)x3 * y2 + + (long)x4 * y1; + c5 <<= 1; + long c6 = (long)x2 * y4 + + (long)x4 * y2; + c6 <<= 1; + c6 += (long)x3 * y3; + long c7 = (long)x3 * y4 + + (long)x4 * y3; + long c8 = (long)x4 * y4; + c8 <<= 1; + + int z8, z9; + long t; + + t = a8 + (c3 - a3); + z8 = (int)t & M26; t >>= 26; + //t += a9 + (c4 - a4); + t += (c4 - a4) - b4; + //z9 = (int)t & M24; t >>= 24; + //t = a0 + (t + ((c5 - a5) << 1)) * 19; + z9 = (int)t & M25; t >>= 25; + t = a0 + (t + c5 - a5) * 38; + z[0] = (int)t & M26; t >>= 26; + t += a1 + (c6 - a6) * 38; + z[1] = (int)t & M26; t >>= 26; + t += a2 + (c7 - a7) * 38; + z[2] = (int)t & M25; t >>= 25; + t += a3 + (c8 - a8) * 38; + z[3] = (int)t & M26; t >>= 26; + //t += a4 - a9 * 38; + t += a4 + b4 * 38; + z[4] = (int)t & M25; t >>= 25; + t += a5 + (c0 - a0); + z[5] = (int)t & M26; t >>= 26; + t += a6 + (c1 - a1); + z[6] = (int)t & M26; t >>= 26; + t += a7 + (c2 - a2); + z[7] = (int)t & M25; t >>= 25; + t += z8; + z[8] = (int)t & M26; t >>= 26; + z[9] = z9 + (int)t; + } + + public static void Normalize(int[] z) + { + int x = (z[9] >> 23) & 1; + Reduce(z, x); + Reduce(z, -x); + Debug.Assert(z[9] >> 24 == 0); + } + + private static void Reduce(int[] z, int c) + { + int z9 = z[9], t = z9; + z9 = t & M24; t >>= 24; + t += c; + t *= 19; + t += z[0]; z[0] = t & M26; t >>= 26; + t += z[1]; z[1] = t & M26; t >>= 26; + t += z[2]; z[2] = t & M25; t >>= 25; + t += z[3]; z[3] = t & M26; t >>= 26; + t += z[4]; z[4] = t & M25; t >>= 25; + t += z[5]; z[5] = t & M26; t >>= 26; + t += z[6]; z[6] = t & M26; t >>= 26; + t += z[7]; z[7] = t & M25; t >>= 25; + t += z[8]; z[8] = t & M26; t >>= 26; + t += z9; z[9] = t; + } + + public static void Sqr(int[] x, int[] z) + { + int x0 = x[0]; + int x1 = x[1]; + int x2 = x[2]; + int x3 = x[3]; + int x4 = x[4]; + + int u0 = x[5]; + int u1 = x[6]; + int u2 = x[7]; + int u3 = x[8]; + int u4 = x[9]; + + int x1_2 = x1 * 2; + int x2_2 = x2 * 2; + int x3_2 = x3 * 2; + int x4_2 = x4 * 2; + + long a0 = (long)x0 * x0; + long a1 = (long)x0 * x1_2; + long a2 = (long)x0 * x2_2 + + (long)x1 * x1; + long a3 = (long)x1_2 * x2_2 + + (long)x0 * x3_2; + long a4 = (long)x2 * x2_2 + + (long)x0 * x4_2 + + (long)x1 * x3_2; + long a5 = (long)x1_2 * x4_2 + + (long)x2_2 * x3_2; + long a6 = (long)x2_2 * x4_2 + + (long)x3 * x3; + long a7 = (long)x3 * x4_2; + long a8 = (long)x4 * x4_2; + + int u1_2 = u1 * 2; + int u2_2 = u2 * 2; + int u3_2 = u3 * 2; + int u4_2 = u4 * 2; + + long b0 = (long)u0 * u0; + long b1 = (long)u0 * u1_2; + long b2 = (long)u0 * u2_2 + + (long)u1 * u1; + long b3 = (long)u1_2 * u2_2 + + (long)u0 * u3_2; + long b4 = (long)u2 * u2_2 + + (long)u0 * u4_2 + + (long)u1 * u3_2; + long b5 = (long)u1_2 * u4_2 + + (long)u2_2 * u3_2; + long b6 = (long)u2_2 * u4_2 + + (long)u3 * u3; + long b7 = (long)u3 * u4_2; + long b8 = (long)u4 * u4_2; + + a0 -= b5 * 38; + a1 -= b6 * 38; + a2 -= b7 * 38; + a3 -= b8 * 38; + + a5 -= b0; + a6 -= b1; + a7 -= b2; + a8 -= b3; + //long a9 = -b4; + + x0 += u0; + x1 += u1; + x2 += u2; + x3 += u3; + x4 += u4; + + x1_2 = x1 * 2; + x2_2 = x2 * 2; + x3_2 = x3 * 2; + x4_2 = x4 * 2; + + long c0 = (long)x0 * x0; + long c1 = (long)x0 * x1_2; + long c2 = (long)x0 * x2_2 + + (long)x1 * x1; + long c3 = (long)x1_2 * x2_2 + + (long)x0 * x3_2; + long c4 = (long)x2 * x2_2 + + (long)x0 * x4_2 + + (long)x1 * x3_2; + long c5 = (long)x1_2 * x4_2 + + (long)x2_2 * x3_2; + long c6 = (long)x2_2 * x4_2 + + (long)x3 * x3; + long c7 = (long)x3 * x4_2; + long c8 = (long)x4 * x4_2; + + int z8, z9; + long t; + + t = a8 + (c3 - a3); + z8 = (int)t & M26; t >>= 26; + //t += a9 + (c4 - a4); + t += (c4 - a4) - b4; + //z9 = (int)t & M24; t >>= 24; + //t = a0 + (t + ((c5 - a5) << 1)) * 19; + z9 = (int)t & M25; t >>= 25; + t = a0 + (t + c5 - a5) * 38; + z[0] = (int)t & M26; t >>= 26; + t += a1 + (c6 - a6) * 38; + z[1] = (int)t & M26; t >>= 26; + t += a2 + (c7 - a7) * 38; + z[2] = (int)t & M25; t >>= 25; + t += a3 + (c8 - a8) * 38; + z[3] = (int)t & M26; t >>= 26; + //t += a4 - a9 * 38; + t += a4 + b4 * 38; + z[4] = (int)t & M25; t >>= 25; + t += a5 + (c0 - a0); + z[5] = (int)t & M26; t >>= 26; + t += a6 + (c1 - a1); + z[6] = (int)t & M26; t >>= 26; + t += a7 + (c2 - a2); + z[7] = (int)t & M25; t >>= 25; + t += z8; + z[8] = (int)t & M26; t >>= 26; + z[9] = z9 + (int)t; + } + + public static void Sqr(int[] x, int n, int[] z) + { + Debug.Assert(n > 0); + + Sqr(x, z); + + while (--n > 0) + { + Sqr(z, z); + } + } + + public static void Sub(int[] x, int[] y, int[] z) + { + for (int i = 0; i < Size; ++i) + { + z[i] = x[i] - y[i]; + } + } + } +} diff --git a/crypto/src/math/ec/rfc7748/X448.cs b/crypto/src/math/ec/rfc7748/X448.cs new file mode 100644 index 000000000..32a4a9e2a --- /dev/null +++ b/crypto/src/math/ec/rfc7748/X448.cs @@ -0,0 +1,255 @@ +using System; +using System.Diagnostics; +using System.Runtime.CompilerServices; + +namespace Org.BouncyCastle.Math.EC.Rfc7748 +{ + public abstract class X448 + { + private const uint C_A = 156326; + private const uint C_A24 = (C_A + 2)/4; + + // 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE + private static readonly uint[] S_x = new uint[]{ 0x0FFFFFFEU, 0x0FFFFFFFU, 0x0FFFFFFFU, 0x0FFFFFFFU, 0x0FFFFFFFU, 0x0FFFFFFFU, + 0x0FFFFFFFU, 0x0FFFFFFFU, 0x0FFFFFFEU, 0x0FFFFFFFU, 0x0FFFFFFFU, 0x0FFFFFFFU, 0x0FFFFFFFU, 0x0FFFFFFFU, 0x0FFFFFFFU, + 0x0FFFFFFFU }; + + // 0xF0FAB725013244423ACF03881AFFEB7BDACDD1031C81B9672954459D84C1F823F1BD65643ACE1B5123AC33FF1C69BAF8ACB1197DC99D2720 + private static readonly uint[] PsubS_x = new uint[]{ 0x099d2720U, 0x0b1197dcU, 0x09baf8acU, 0x033ff1c6U, 0x0b5123acU, + 0x0643ace1U, 0x03f1bd65U, 0x084c1f82U, 0x0954459dU, 0x081b9672U, 0x0dd1031cU, 0x0eb7bdacU, 0x03881affU, 0x0423acf0U, + 0x05013244U, 0x0f0fab72U }; + + private static uint[] precompBase = null; + + private static uint Decode32(byte[] bs, int off) + { + uint n = bs[off]; + n |= (uint)bs[++off] << 8; + n |= (uint)bs[++off] << 16; + n |= (uint)bs[++off] << 24; + return n; + } + + private static void DecodeScalar(byte[] k, int kOff, uint[] n) + { + for (int i = 0; i < 14; ++i) + { + n[i] = Decode32(k, kOff + i * 4); + } + + n[ 0] &= 0xFFFFFFFCU; + n[13] |= 0x80000000U; + } + + private static void PointDouble(uint[] x, uint[] z) + { + uint[] A = X448Field.Create(); + uint[] B = X448Field.Create(); + + //X448Field.Apm(x, z, A, B); + X448Field.Add(x, z, A); + X448Field.Sub(x, z, B); + X448Field.Sqr(A, A); + X448Field.Sqr(B, B); + X448Field.Mul(A, B, x); + X448Field.Sub(A, B, A); + X448Field.Mul(A, C_A24, z); + X448Field.Add(z, B, z); + X448Field.Mul(z, A, z); + } + + [MethodImpl(MethodImplOptions.Synchronized)] + public static void Precompute() + { + if (precompBase != null) + return; + + precompBase = new uint[X448Field.Size * 446]; + + uint[] xs = precompBase; + uint[] zs = new uint[X448Field.Size * 445]; + + uint[] x = X448Field.Create(); x[0] = 5; + uint[] z = X448Field.Create(); z[0] = 1; + + uint[] n = X448Field.Create(); + uint[] d = X448Field.Create(); + + //X448Field.Apm(x, z, n, d); + X448Field.Add(x, z, n); + X448Field.Sub(x, z, d); + + uint[] c = X448Field.Create(); X448Field.Copy(d, 0, c, 0); + + int off = 0; + for (;;) + { + X448Field.Copy(n, 0, xs, off); + + if (off == (X448Field.Size * 445)) + break; + + PointDouble(x, z); + + //X448Field.Apm(x, z, n, d); + X448Field.Add(x, z, n); + X448Field.Sub(x, z, d); + X448Field.Mul(n, c, n); + X448Field.Mul(c, d, c); + + X448Field.Copy(d, 0, zs, off); + + off += X448Field.Size; + } + + uint[] u = X448Field.Create(); + X448Field.Inv(c, u); + + for (;;) + { + X448Field.Copy(xs, off, x, 0); + + X448Field.Mul(x, u, x); + //X448Field.Normalize(x); + X448Field.Copy(x, 0, precompBase, off); + + if (off == 0) + break; + + off -= X448Field.Size; + X448Field.Copy(zs, off, z, 0); + X448Field.Mul(u, z, u); + } + } + + public static void ScalarMult(byte[] k, int kOff, byte[] u, int uOff, byte[] r, int rOff) + { + uint[] n = new uint[14]; DecodeScalar(k, kOff, n); + + uint[] x1 = X448Field.Create(); X448Field.Decode(u, uOff, x1); + uint[] x2 = X448Field.Create(); X448Field.Copy(x1, 0, x2, 0); + uint[] z2 = X448Field.Create(); z2[0] = 1; + uint[] x3 = X448Field.Create(); x3[0] = 1; + uint[] z3 = X448Field.Create(); + + uint[] t1 = X448Field.Create(); + uint[] t2 = X448Field.Create(); + + Debug.Assert(n[13] >> 31 == 1U); + + int bit = 447, swap = 1; + do + { + //X448Field.Apm(x3, z3, t1, x3); + X448Field.Add(x3, z3, t1); + X448Field.Sub(x3, z3, x3); + //X448Field.Apm(x2, z2, z3, x2); + X448Field.Add(x2, z2, z3); + X448Field.Sub(x2, z2, x2); + + X448Field.Mul(t1, x2, t1); + X448Field.Mul(x3, z3, x3); + X448Field.Sqr(z3, z3); + X448Field.Sqr(x2, x2); + + X448Field.Sub(z3, x2, t2); + X448Field.Mul(t2, C_A24, z2); + X448Field.Add(z2, x2, z2); + X448Field.Mul(z2, t2, z2); + X448Field.Mul(x2, z3, x2); + + //X448Field.Apm(t1, x3, x3, z3); + X448Field.Sub(t1, x3, z3); + X448Field.Add(t1, x3, x3); + X448Field.Sqr(x3, x3); + X448Field.Sqr(z3, z3); + X448Field.Mul(z3, x1, z3); + + --bit; + + int word = bit >> 5, shift = bit & 0x1F; + int kt = (int)(n[word] >> shift) & 1; + swap ^= kt; + X448Field.CSwap(swap, x2, x3); + X448Field.CSwap(swap, z2, z3); + swap = kt; + } + while (bit >= 2); + + Debug.Assert(swap == 0); + + for (int i = 0; i < 2; ++i) + { + PointDouble(x2, z2); + } + + X448Field.Inv(z2, z2); + X448Field.Mul(x2, z2, x2); + + X448Field.Normalize(x2); + X448Field.Encode(x2, r, rOff); + } + + public static void ScalarMultBase(byte[] k, int kOff, byte[] r, int rOff) + { + Precompute(); + + uint[] n = new uint[14]; DecodeScalar(k, kOff, n); + + uint[] x0 = X448Field.Create(); + uint[] x1 = X448Field.Create(); X448Field.Copy(S_x, 0, x1, 0); + uint[] z1 = X448Field.Create(); z1[0] = 1; + uint[] x2 = X448Field.Create(); X448Field.Copy(PsubS_x, 0, x2, 0); + uint[] z2 = X448Field.Create(); z2[0] = 1; + + uint[] A = X448Field.Create(); + uint[] B = z1; + uint[] C = x0; + uint[] D = x1; + uint[] E = B; + + Debug.Assert(n[13] >> 31 == 1U); + + int off = 0, bit = 2, swap = 1; + do + { + X448Field.Copy(precompBase, off, x0, 0); + off += X448Field.Size; + + int word = bit >> 5, shift = bit & 0x1F; + int kt = (int)(n[word] >> shift) & 1; + swap ^= kt; + X448Field.CSwap(swap, x1, x2); + X448Field.CSwap(swap, z1, z2); + swap = kt; + + //X448Field.Apm(x1, z1, A, B); + X448Field.Add(x1, z1, A); + X448Field.Sub(x1, z1, B); + X448Field.Mul(x0, B, C); + X448Field.Carry(A); + //X448Field.Apm(A, C, D, E); + X448Field.Add(A, C, D); + X448Field.Sub(A, C, E); + X448Field.Sqr(D, D); + X448Field.Sqr(E, E); + X448Field.Mul(z2, D, x1); + X448Field.Mul(x2, E, z1); + } + while (++bit < 448); + + Debug.Assert(swap == 1); + + for (int i = 0; i < 2; ++i) + { + PointDouble(x1, z1); + } + + X448Field.Inv(z1, z1); + X448Field.Mul(x1, z1, x1); + + X448Field.Normalize(x1); + X448Field.Encode(x1, r, rOff); + } + } +} diff --git a/crypto/src/math/ec/rfc7748/X448Field.cs b/crypto/src/math/ec/rfc7748/X448Field.cs new file mode 100644 index 000000000..0c44f1eb5 --- /dev/null +++ b/crypto/src/math/ec/rfc7748/X448Field.cs @@ -0,0 +1,904 @@ +using System; +using System.Diagnostics; + +namespace Org.BouncyCastle.Math.EC.Rfc7748 +{ + [CLSCompliantAttribute(false)] + public abstract class X448Field + { + public const int Size = 16; + + private const uint M28 = 0x0FFFFFFFU; + + private X448Field() {} + + public static void Add(uint[] x, uint[] y, uint[] z) + { + for (int i = 0; i < Size; ++i) + { + z[i] = x[i] + y[i]; + } + } + + //public static void Apm(int[] x, int[] y, int[] zp, int[] zm) + //{ + // for (int i = 0; i < Size; ++i) + // { + // int xi = x[i], yi = y[i]; + // zp[i] = xi + yi; + // zm[i] = xi - yi; + // } + //} + + public static void Carry(uint[] z) + { + uint z0 = z[0], z1 = z[1], z2 = z[2], z3 = z[3], z4 = z[4], z5 = z[5], z6 = z[6], z7 = z[7]; + uint z8 = z[8], z9 = z[9], z10 = z[10], z11 = z[11], z12 = z[12], z13 = z[13], z14 = z[14], z15 = z[15]; + + z2 += (z1 >> 28); z1 &= M28; + z6 += (z5 >> 28); z5 &= M28; + z10 += (z9 >> 28); z9 &= M28; + z14 += (z13 >> 28); z13 &= M28; + + z3 += (z2 >> 28); z2 &= M28; + z7 += (z6 >> 28); z6 &= M28; + z11 += (z10 >> 28); z10 &= M28; + z15 += (z14 >> 28); z14 &= M28; + + uint t = z15 >> 28; z15 &= M28; + z0 += t; + z8 += t; + + z4 += (z3 >> 28); z3 &= M28; + z8 += (z7 >> 28); z7 &= M28; + z12 += (z11 >> 28); z11 &= M28; + + z1 += (z0 >> 28); z0 &= M28; + z5 += (z4 >> 28); z4 &= M28; + z9 += (z8 >> 28); z8 &= M28; + z13 += (z12 >> 28); z12 &= M28; + + z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4; z[5] = z5; z[6] = z6; z[7] = z7; + z[8] = z8; z[9] = z9; z[10] = z10; z[11] = z11; z[12] = z12; z[13] = z13; z[14] = z14; z[15] = z15; + } + + public static void Copy(uint[] x, int xOff, uint[] z, int zOff) + { + for (int i = 0; i < Size; ++i) + { + z[zOff + i] = x[xOff + i]; + } + } + + public static uint[] Create() + { + return new uint[Size]; + } + + public static void CSwap(int swap, uint[] a, uint[] b) + { + Debug.Assert(swap >> 1 == 0); + Debug.Assert(a != b); + + uint mask = (uint)(0 - swap); + for (int i = 0; i < Size; ++i) + { + uint ai = a[i], bi = b[i]; + uint dummy = mask & (ai ^ bi); + a[i] = ai ^ dummy; + b[i] = bi ^ dummy; + } + } + + public static void Decode(byte[] x, int xOff, uint[] z) + { + Decode56(x, xOff, z, 0); + Decode56(x, xOff + 7, z, 2); + Decode56(x, xOff + 14, z, 4); + Decode56(x, xOff + 21, z, 6); + Decode56(x, xOff + 28, z, 8); + Decode56(x, xOff + 35, z, 10); + Decode56(x, xOff + 42, z, 12); + Decode56(x, xOff + 49, z, 14); + } + + private static uint Decode24(byte[] bs, int off) + { + uint n = bs[off]; + n |= (uint)bs[++off] << 8; + n |= (uint)bs[++off] << 16; + return n; + } + + private static uint Decode32(byte[] bs, int off) + { + uint n = bs[off]; + n |= (uint)bs[++off] << 8; + n |= (uint)bs[++off] << 16; + n |= (uint)bs[++off] << 24; + return n; + } + + private static void Decode56(byte[] bs, int off, uint[] z, int zOff) + { + uint lo = Decode32(bs, off); + uint hi = Decode24(bs, off + 4); + z[zOff] = lo & M28; + z[zOff + 1] = (lo >> 28) | (hi << 4); + } + + public static void Encode(uint[] x, byte[] z, int zOff) + { + Encode56(x, 0, z, zOff); + Encode56(x, 2, z, zOff + 7); + Encode56(x, 4, z, zOff + 14); + Encode56(x, 6, z, zOff + 21); + Encode56(x, 8, z, zOff + 28); + Encode56(x, 10, z, zOff + 35); + Encode56(x, 12, z, zOff + 42); + Encode56(x, 14, z, zOff + 49); + } + + private static void Encode24(uint n, byte[] bs, int off) + { + bs[ off] = (byte)(n ); + bs[++off] = (byte)(n >> 8); + bs[++off] = (byte)(n >> 16); + } + + private static void Encode32(uint n, byte[] bs, int off) + { + bs[ off] = (byte)(n ); + bs[++off] = (byte)(n >> 8); + bs[++off] = (byte)(n >> 16); + bs[++off] = (byte)(n >> 24); + } + + private static void Encode56(uint[] x, int xOff, byte[] bs, int off) + { + uint lo = x[xOff], hi = x[xOff + 1]; + Encode32(lo | (hi << 28), bs, off); + Encode24(hi >> 4, bs, off + 4); + } + + public static void Inv(uint[] x, uint[] z) + { + // z = x^(p-2) = x^(2^448 - 2^224 - 3) + // (223 1s) (1 0s) (222 1s) (1 0s) (1 1s) + // Addition chain: [1] 2 3 6 9 18 19 37 74 111 [222] [223] + uint[] x2 = Create(); Sqr(x, x2); Mul(x, x2, x2); + uint[] x3 = Create(); Sqr(x2, x3); Mul(x, x3, x3); + uint[] x6 = Create(); Sqr(x3, 3, x6); Mul(x3, x6, x6); + uint[] x9 = Create(); Sqr(x6, 3, x9); Mul(x3, x9, x9); + uint[] x18 = Create(); Sqr(x9, 9, x18); Mul(x9, x18, x18); + uint[] x19 = Create(); Sqr(x18, x19); Mul(x, x19, x19); + uint[] x37 = Create(); Sqr(x19, 18, x37); Mul(x18, x37, x37); + uint[] x74 = Create(); Sqr(x37, 37, x74); Mul(x37, x74, x74); + uint[] x111 = Create(); Sqr(x74, 37, x111); Mul(x37, x111, x111); + uint[] x222 = Create(); Sqr(x111, 111, x222); Mul(x111, x222, x222); + uint[] x223 = Create(); Sqr(x222, x223); Mul(x, x223, x223); + + uint[] t = Create(); + Sqr(x223, 223, t); + Mul(t, x222, t); + Sqr(t, 2, t); + Mul(t, x, z); + } + + public static void Mul(uint[] x, uint y, uint[] z) + { + uint x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7]; + uint x8 = x[8], x9 = x[9], x10 = x[10], x11 = x[11], x12 = x[12], x13 = x[13], x14 = x[14], x15 = x[15]; + + uint z1, z5, z9, z13; + ulong c, d, e, f; + + c = (ulong)x1 * y; + z1 = (uint)c & M28; c >>= 28; + d = (ulong)x5 * y; + z5 = (uint)d & M28; d >>= 28; + e = (ulong)x9 * y; + z9 = (uint)e & M28; e >>= 28; + f = (ulong)x13 * y; + z13 = (uint)f & M28; f >>= 28; + + c += (ulong)x2 * y; + z[2] = (uint)c & M28; c >>= 28; + d += (ulong)x6 * y; + z[6] = (uint)d & M28; d >>= 28; + e += (ulong)x10 * y; + z[10] = (uint)e & M28; e >>= 28; + f += (ulong)x14 * y; + z[14] = (uint)f & M28; f >>= 28; + + c += (ulong)x3 * y; + z[3] = (uint)c & M28; c >>= 28; + d += (ulong)x7 * y; + z[7] = (uint)d & M28; d >>= 28; + e += (ulong)x11 * y; + z[11] = (uint)e & M28; e >>= 28; + f += (ulong)x15 * y; + z[15] = (uint)f & M28; f >>= 28; + + d += f; + + c += (ulong)x4 * y; + z[4] = (uint)c & M28; c >>= 28; + d += (ulong)x8 * y; + z[8] = (uint)d & M28; d >>= 28; + e += (ulong)x12 * y; + z[12] = (uint)e & M28; e >>= 28; + f += (ulong)x0 * y; + z[0] = (uint)f & M28; f >>= 28; + + z[1] = z1 + (uint)f; + z[5] = z5 + (uint)c; + z[9] = z9 + (uint)d; + z[13] = z13 + (uint)e; + } + + public static void Mul(uint[] x, uint[] y, uint[] z) + { + uint x0 = x[0]; + uint x1 = x[1]; + uint x2 = x[2]; + uint x3 = x[3]; + uint x4 = x[4]; + uint x5 = x[5]; + uint x6 = x[6]; + uint x7 = x[7]; + + uint u0 = x[8]; + uint u1 = x[9]; + uint u2 = x[10]; + uint u3 = x[11]; + uint u4 = x[12]; + uint u5 = x[13]; + uint u6 = x[14]; + uint u7 = x[15]; + + uint y0 = y[0]; + uint y1 = y[1]; + uint y2 = y[2]; + uint y3 = y[3]; + uint y4 = y[4]; + uint y5 = y[5]; + uint y6 = y[6]; + uint y7 = y[7]; + + uint v0 = y[8]; + uint v1 = y[9]; + uint v2 = y[10]; + uint v3 = y[11]; + uint v4 = y[12]; + uint v5 = y[13]; + uint v6 = y[14]; + uint v7 = y[15]; + + uint s0 = x0 + u0; + uint s1 = x1 + u1; + uint s2 = x2 + u2; + uint s3 = x3 + u3; + uint s4 = x4 + u4; + uint s5 = x5 + u5; + uint s6 = x6 + u6; + uint s7 = x7 + u7; + + uint t0 = y0 + v0; + uint t1 = y1 + v1; + uint t2 = y2 + v2; + uint t3 = y3 + v3; + uint t4 = y4 + v4; + uint t5 = y5 + v5; + uint t6 = y6 + v6; + uint t7 = y7 + v7; + + uint z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14, z15; + ulong c, d; + + ulong f0 = (ulong)x0 * y0; + ulong f8 = (ulong)x7 * y1 + + (ulong)x6 * y2 + + (ulong)x5 * y3 + + (ulong)x4 * y4 + + (ulong)x3 * y5 + + (ulong)x2 * y6 + + (ulong)x1 * y7; + ulong g0 = (ulong)u0 * v0; + ulong g8 = (ulong)u7 * v1 + + (ulong)u6 * v2 + + (ulong)u5 * v3 + + (ulong)u4 * v4 + + (ulong)u3 * v5 + + (ulong)u2 * v6 + + (ulong)u1 * v7; + ulong h0 = (ulong)s0 * t0; + ulong h8 = (ulong)s7 * t1 + + (ulong)s6 * t2 + + (ulong)s5 * t3 + + (ulong)s4 * t4 + + (ulong)s3 * t5 + + (ulong)s2 * t6 + + (ulong)s1 * t7; + + c = f0 + g0 + h8 - f8; + z0 = (uint)c & M28; c >>= 28; + d = g8 + h0 - f0 + h8; + z8 = (uint)d & M28; d >>= 28; + + ulong f1 = (ulong)x1 * y0 + + (ulong)x0 * y1; + ulong f9 = (ulong)x7 * y2 + + (ulong)x6 * y3 + + (ulong)x5 * y4 + + (ulong)x4 * y5 + + (ulong)x3 * y6 + + (ulong)x2 * y7; + ulong g1 = (ulong)u1 * v0 + + (ulong)u0 * v1; + ulong g9 = (ulong)u7 * v2 + + (ulong)u6 * v3 + + (ulong)u5 * v4 + + (ulong)u4 * v5 + + (ulong)u3 * v6 + + (ulong)u2 * v7; + ulong h1 = (ulong)s1 * t0 + + (ulong)s0 * t1; + ulong h9 = (ulong)s7 * t2 + + (ulong)s6 * t3 + + (ulong)s5 * t4 + + (ulong)s4 * t5 + + (ulong)s3 * t6 + + (ulong)s2 * t7; + + c += f1 + g1 + h9 - f9; + z1 = (uint)c & M28; c >>= 28; + d += g9 + h1 - f1 + h9; + z9 = (uint)d & M28; d >>= 28; + + ulong f2 = (ulong)x2 * y0 + + (ulong)x1 * y1 + + (ulong)x0 * y2; + ulong f10 = (ulong)x7 * y3 + + (ulong)x6 * y4 + + (ulong)x5 * y5 + + (ulong)x4 * y6 + + (ulong)x3 * y7; + ulong g2 = (ulong)u2 * v0 + + (ulong)u1 * v1 + + (ulong)u0 * v2; + ulong g10 = (ulong)u7 * v3 + + (ulong)u6 * v4 + + (ulong)u5 * v5 + + (ulong)u4 * v6 + + (ulong)u3 * v7; + ulong h2 = (ulong)s2 * t0 + + (ulong)s1 * t1 + + (ulong)s0 * t2; + ulong h10 = (ulong)s7 * t3 + + (ulong)s6 * t4 + + (ulong)s5 * t5 + + (ulong)s4 * t6 + + (ulong)s3 * t7; + + c += f2 + g2 + h10 - f10; + z2 = (uint)c & M28; c >>= 28; + d += g10 + h2 - f2 + h10; + z10 = (uint)d & M28; d >>= 28; + + ulong f3 = (ulong)x3 * y0 + + (ulong)x2 * y1 + + (ulong)x1 * y2 + + (ulong)x0 * y3; + ulong f11 = (ulong)x7 * y4 + + (ulong)x6 * y5 + + (ulong)x5 * y6 + + (ulong)x4 * y7; + ulong g3 = (ulong)u3 * v0 + + (ulong)u2 * v1 + + (ulong)u1 * v2 + + (ulong)u0 * v3; + ulong g11 = (ulong)u7 * v4 + + (ulong)u6 * v5 + + (ulong)u5 * v6 + + (ulong)u4 * v7; + ulong h3 = (ulong)s3 * t0 + + (ulong)s2 * t1 + + (ulong)s1 * t2 + + (ulong)s0 * t3; + ulong h11 = (ulong)s7 * t4 + + (ulong)s6 * t5 + + (ulong)s5 * t6 + + (ulong)s4 * t7; + + c += f3 + g3 + h11 - f11; + z3 = (uint)c & M28; c >>= 28; + d += g11 + h3 - f3 + h11; + z11 = (uint)d & M28; d >>= 28; + + ulong f4 = (ulong)x4 * y0 + + (ulong)x3 * y1 + + (ulong)x2 * y2 + + (ulong)x1 * y3 + + (ulong)x0 * y4; + ulong f12 = (ulong)x7 * y5 + + (ulong)x6 * y6 + + (ulong)x5 * y7; + ulong g4 = (ulong)u4 * v0 + + (ulong)u3 * v1 + + (ulong)u2 * v2 + + (ulong)u1 * v3 + + (ulong)u0 * v4; + ulong g12 = (ulong)u7 * v5 + + (ulong)u6 * v6 + + (ulong)u5 * v7; + ulong h4 = (ulong)s4 * t0 + + (ulong)s3 * t1 + + (ulong)s2 * t2 + + (ulong)s1 * t3 + + (ulong)s0 * t4; + ulong h12 = (ulong)s7 * t5 + + (ulong)s6 * t6 + + (ulong)s5 * t7; + + c += f4 + g4 + h12 - f12; + z4 = (uint)c & M28; c >>= 28; + d += g12 + h4 - f4 + h12; + z12 = (uint)d & M28; d >>= 28; + + ulong f5 = (ulong)x5 * y0 + + (ulong)x4 * y1 + + (ulong)x3 * y2 + + (ulong)x2 * y3 + + (ulong)x1 * y4 + + (ulong)x0 * y5; + ulong f13 = (ulong)x7 * y6 + + (ulong)x6 * y7; + ulong g5 = (ulong)u5 * v0 + + (ulong)u4 * v1 + + (ulong)u3 * v2 + + (ulong)u2 * v3 + + (ulong)u1 * v4 + + (ulong)u0 * v5; + ulong g13 = (ulong)u7 * v6 + + (ulong)u6 * v7; + ulong h5 = (ulong)s5 * t0 + + (ulong)s4 * t1 + + (ulong)s3 * t2 + + (ulong)s2 * t3 + + (ulong)s1 * t4 + + (ulong)s0 * t5; + ulong h13 = (ulong)s7 * t6 + + (ulong)s6 * t7; + + c += f5 + g5 + h13 - f13; + z5 = (uint)c & M28; c >>= 28; + d += g13 + h5 - f5 + h13; + z13 = (uint)d & M28; d >>= 28; + + ulong f6 = (ulong)x6 * y0 + + (ulong)x5 * y1 + + (ulong)x4 * y2 + + (ulong)x3 * y3 + + (ulong)x2 * y4 + + (ulong)x1 * y5 + + (ulong)x0 * y6; + ulong f14 = (ulong)x7 * y7; + ulong g6 = (ulong)u6 * v0 + + (ulong)u5 * v1 + + (ulong)u4 * v2 + + (ulong)u3 * v3 + + (ulong)u2 * v4 + + (ulong)u1 * v5 + + (ulong)u0 * v6; + ulong g14 = (ulong)u7 * v7; + ulong h6 = (ulong)s6 * t0 + + (ulong)s5 * t1 + + (ulong)s4 * t2 + + (ulong)s3 * t3 + + (ulong)s2 * t4 + + (ulong)s1 * t5 + + (ulong)s0 * t6; + ulong h14 = (ulong)s7 * t7; + + c += f6 + g6 + h14 - f14; + z6 = (uint)c & M28; c >>= 28; + d += g14 + h6 - f6 + h14; + z14 = (uint)d & M28; d >>= 28; + + ulong f7 = (ulong)x7 * y0 + + (ulong)x6 * y1 + + (ulong)x5 * y2 + + (ulong)x4 * y3 + + (ulong)x3 * y4 + + (ulong)x2 * y5 + + (ulong)x1 * y6 + + (ulong)x0 * y7; + ulong g7 = (ulong)u7 * v0 + + (ulong)u6 * v1 + + (ulong)u5 * v2 + + (ulong)u4 * v3 + + (ulong)u3 * v4 + + (ulong)u2 * v5 + + (ulong)u1 * v6 + + (ulong)u0 * v7; + ulong h7 = (ulong)s7 * t0 + + (ulong)s6 * t1 + + (ulong)s5 * t2 + + (ulong)s4 * t3 + + (ulong)s3 * t4 + + (ulong)s2 * t5 + + (ulong)s1 * t6 + + (ulong)s0 * t7; + + c += f7 + g7; + z7 = (uint)c & M28; c >>= 28; + d += h7 - f7; + z15 = (uint)d & M28; d >>= 28; + + c += d; + + c += z8; + z8 = (uint)c & M28; c >>= 28; + d += z0; + z0 = (uint)d & M28; d >>= 28; + z9 += (uint)c; + z1 += (uint)d; + + z[0] = z0; + z[1] = z1; + z[2] = z2; + z[3] = z3; + z[4] = z4; + z[5] = z5; + z[6] = z6; + z[7] = z7; + z[8] = z8; + z[9] = z9; + z[10] = z10; + z[11] = z11; + z[12] = z12; + z[13] = z13; + z[14] = z14; + z[15] = z15; + } + + public static void Normalize(uint[] z) + { + //int x = (z[15] >> (28 - 1)) & 1; + Reduce(z, 1); + Reduce(z, -1); + Debug.Assert(z[15] >> 28 == 0U); + } + + private static void Reduce(uint[] z, int c) + { + uint z15 = z[15]; + long t = z15; + z15 &= M28; + t = (t >> 28) + c; + z[8] += (uint)t; + for (int i = 0; i < 15; ++i) + { + t += z[i]; z[i] = (uint)t & M28; t >>= 28; + } + z[15] = z15 + (uint)t; + } + + public static void Sqr(uint[] x, uint[] z) + { + uint x0 = x[0]; + uint x1 = x[1]; + uint x2 = x[2]; + uint x3 = x[3]; + uint x4 = x[4]; + uint x5 = x[5]; + uint x6 = x[6]; + uint x7 = x[7]; + + uint u0 = x[8]; + uint u1 = x[9]; + uint u2 = x[10]; + uint u3 = x[11]; + uint u4 = x[12]; + uint u5 = x[13]; + uint u6 = x[14]; + uint u7 = x[15]; + + uint x0_2 = x0 * 2; + uint x1_2 = x1 * 2; + uint x2_2 = x2 * 2; + uint x3_2 = x3 * 2; + uint x4_2 = x4 * 2; + uint x5_2 = x5 * 2; + uint x6_2 = x6 * 2; + + uint u0_2 = u0 * 2; + uint u1_2 = u1 * 2; + uint u2_2 = u2 * 2; + uint u3_2 = u3 * 2; + uint u4_2 = u4 * 2; + uint u5_2 = u5 * 2; + uint u6_2 = u6 * 2; + + uint s0 = x0 + u0; + uint s1 = x1 + u1; + uint s2 = x2 + u2; + uint s3 = x3 + u3; + uint s4 = x4 + u4; + uint s5 = x5 + u5; + uint s6 = x6 + u6; + uint s7 = x7 + u7; + + uint s0_2 = s0 * 2; + uint s1_2 = s1 * 2; + uint s2_2 = s2 * 2; + uint s3_2 = s3 * 2; + uint s4_2 = s4 * 2; + uint s5_2 = s5 * 2; + uint s6_2 = s6 * 2; + + uint z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14, z15; + ulong c, d; + + ulong f0 = (ulong)x0 * x0; + ulong f8 = (ulong)x7 * x1_2 + + (ulong)x6 * x2_2 + + (ulong)x5 * x3_2 + + (ulong)x4 * x4; + ulong g0 = (ulong)u0 * u0; + ulong g8 = (ulong)u7 * u1_2 + + (ulong)u6 * u2_2 + + (ulong)u5 * u3_2 + + (ulong)u4 * u4; + ulong h0 = (ulong)s0 * s0; + ulong h8 = (ulong)s7 * s1_2 + + (ulong)s6 * s2_2 + + (ulong)s5 * s3_2 + + (ulong)s4 * s4; + + c = f0 + g0 + h8 - f8; + z0 = (uint)c & M28; c >>= 28; + d = g8 + h0 - f0 + h8; + z8 = (uint)d & M28; d >>= 28; + + ulong f1 = (ulong)x1 * x0_2; + ulong f9 = (ulong)x7 * x2_2 + + (ulong)x6 * x3_2 + + (ulong)x5 * x4_2; + ulong g1 = (ulong)u1 * u0_2; + ulong g9 = (ulong)u7 * u2_2 + + (ulong)u6 * u3_2 + + (ulong)u5 * u4_2; + ulong h1 = (ulong)s1 * s0_2; + ulong h9 = (ulong)s7 * s2_2 + + (ulong)s6 * s3_2 + + (ulong)s5 * s4_2; + + c += f1 + g1 + h9 - f9; + z1 = (uint)c & M28; c >>= 28; + d += g9 + h1 - f1 + h9; + z9 = (uint)d & M28; d >>= 28; + + ulong f2 = (ulong)x2 * x0_2 + + (ulong)x1 * x1; + ulong f10 = (ulong)x7 * x3_2 + + (ulong)x6 * x4_2 + + (ulong)x5 * x5; + ulong g2 = (ulong)u2 * u0_2 + + (ulong)u1 * u1; + ulong g10 = (ulong)u7 * u3_2 + + (ulong)u6 * u4_2 + + (ulong)u5 * u5; + ulong h2 = (ulong)s2 * s0_2 + + (ulong)s1 * s1; + ulong h10 = (ulong)s7 * s3_2 + + (ulong)s6 * s4_2 + + (ulong)s5 * s5; + + c += f2 + g2 + h10 - f10; + z2 = (uint)c & M28; c >>= 28; + d += g10 + h2 - f2 + h10; + z10 = (uint)d & M28; d >>= 28; + + ulong f3 = (ulong)x3 * x0_2 + + (ulong)x2 * x1_2; + ulong f11 = (ulong)x7 * x4_2 + + (ulong)x6 * x5_2; + ulong g3 = (ulong)u3 * u0_2 + + (ulong)u2 * u1_2; + ulong g11 = (ulong)u7 * u4_2 + + (ulong)u6 * u5_2; + ulong h3 = (ulong)s3 * s0_2 + + (ulong)s2 * s1_2; + ulong h11 = (ulong)s7 * s4_2 + + (ulong)s6 * s5_2; + + c += f3 + g3 + h11 - f11; + z3 = (uint)c & M28; c >>= 28; + d += g11 + h3 - f3 + h11; + z11 = (uint)d & M28; d >>= 28; + + ulong f4 = (ulong)x4 * x0_2 + + (ulong)x3 * x1_2 + + (ulong)x2 * x2; + ulong f12 = (ulong)x7 * x5_2 + + (ulong)x6 * x6; + ulong g4 = (ulong)u4 * u0_2 + + (ulong)u3 * u1_2 + + (ulong)u2 * u2; + ulong g12 = (ulong)u7 * u5_2 + + (ulong)u6 * u6; + ulong h4 = (ulong)s4 * s0_2 + + (ulong)s3 * s1_2 + + (ulong)s2 * s2; + ulong h12 = (ulong)s7 * s5_2 + + (ulong)s6 * s6; + + c += f4 + g4 + h12 - f12; + z4 = (uint)c & M28; c >>= 28; + d += g12 + h4 - f4 + h12; + z12 = (uint)d & M28; d >>= 28; + + ulong f5 = (ulong)x5 * x0_2 + + (ulong)x4 * x1_2 + + (ulong)x3 * x2_2; + ulong f13 = (ulong)x7 * x6_2; + ulong g5 = (ulong)u5 * u0_2 + + (ulong)u4 * u1_2 + + (ulong)u3 * u2_2; + ulong g13 = (ulong)u7 * u6_2; + ulong h5 = (ulong)s5 * s0_2 + + (ulong)s4 * s1_2 + + (ulong)s3 * s2_2; + ulong h13 = (ulong)s7 * s6_2; + + c += f5 + g5 + h13 - f13; + z5 = (uint)c & M28; c >>= 28; + d += g13 + h5 - f5 + h13; + z13 = (uint)d & M28; d >>= 28; + + ulong f6 = (ulong)x6 * x0_2 + + (ulong)x5 * x1_2 + + (ulong)x4 * x2_2 + + (ulong)x3 * x3; + ulong f14 = (ulong)x7 * x7; + ulong g6 = (ulong)u6 * u0_2 + + (ulong)u5 * u1_2 + + (ulong)u4 * u2_2 + + (ulong)u3 * u3; + ulong g14 = (ulong)u7 * u7; + ulong h6 = (ulong)s6 * s0_2 + + (ulong)s5 * s1_2 + + (ulong)s4 * s2_2 + + (ulong)s3 * s3; + ulong h14 = (ulong)s7 * s7; + + c += f6 + g6 + h14 - f14; + z6 = (uint)c & M28; c >>= 28; + d += g14 + h6 - f6 + h14; + z14 = (uint)d & M28; d >>= 28; + + ulong f7 = (ulong)x7 * x0_2 + + (ulong)x6 * x1_2 + + (ulong)x5 * x2_2 + + (ulong)x4 * x3_2; + ulong g7 = (ulong)u7 * u0_2 + + (ulong)u6 * u1_2 + + (ulong)u5 * u2_2 + + (ulong)u4 * u3_2; + ulong h7 = (ulong)s7 * s0_2 + + (ulong)s6 * s1_2 + + (ulong)s5 * s2_2 + + (ulong)s4 * s3_2; + + c += f7 + g7; + z7 = (uint)c & M28; c >>= 28; + d += h7 - f7; + z15 = (uint)d & M28; d >>= 28; + + c += d; + + c += z8; + z8 = (uint)c & M28; c >>= 28; + d += z0; + z0 = (uint)d & M28; d >>= 28; + z9 += (uint)c; + z1 += (uint)d; + + z[0] = z0; + z[1] = z1; + z[2] = z2; + z[3] = z3; + z[4] = z4; + z[5] = z5; + z[6] = z6; + z[7] = z7; + z[8] = z8; + z[9] = z9; + z[10] = z10; + z[11] = z11; + z[12] = z12; + z[13] = z13; + z[14] = z14; + z[15] = z15; + } + + public static void Sqr(uint[] x, int n, uint[] z) + { + Debug.Assert(n > 0); + + Sqr(x, z); + + while (--n > 0) + { + Sqr(z, z); + } + } + + public static void Sub(uint[] x, uint[] y, uint[] z) + { + uint x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7]; + uint x8 = x[8], x9 = x[9], x10 = x[10], x11 = x[11], x12 = x[12], x13 = x[13], x14 = x[14], x15 = x[15]; + uint y0 = y[0], y1 = y[1], y2 = y[2], y3 = y[3], y4 = y[4], y5 = y[5], y6 = y[6], y7 = y[7]; + uint y8 = y[8], y9 = y[9], y10 = y[10], y11 = y[11], y12 = y[12], y13 = y[13], y14 = y[14], y15 = y[15]; + + uint z0 = x0 + 0x1FFFFFFEU - y0; + uint z1 = x1 + 0x1FFFFFFEU - y1; + uint z2 = x2 + 0x1FFFFFFEU - y2; + uint z3 = x3 + 0x1FFFFFFEU - y3; + uint z4 = x4 + 0x1FFFFFFEU - y4; + uint z5 = x5 + 0x1FFFFFFEU - y5; + uint z6 = x6 + 0x1FFFFFFEU - y6; + uint z7 = x7 + 0x1FFFFFFEU - y7; + uint z8 = x8 + 0x1FFFFFFCU - y8; + uint z9 = x9 + 0x1FFFFFFEU - y9; + uint z10 = x10 + 0x1FFFFFFEU - y10; + uint z11 = x11 + 0x1FFFFFFEU - y11; + uint z12 = x12 + 0x1FFFFFFEU - y12; + uint z13 = x13 + 0x1FFFFFFEU - y13; + uint z14 = x14 + 0x1FFFFFFEU - y14; + uint z15 = x15 + 0x1FFFFFFEU - y15; + + z2 += z1 >> 28; z1 &= M28; + z6 += z5 >> 28; z5 &= M28; + z10 += z9 >> 28; z9 &= M28; + z14 += z13 >> 28; z13 &= M28; + + z3 += z2 >> 28; z2 &= M28; + z7 += z6 >> 28; z6 &= M28; + z11 += z10 >> 28; z10 &= M28; + z15 += z14 >> 28; z14 &= M28; + + uint t = z15 >> 28; z15 &= M28; + z0 += t; + z8 += t; + + z4 += z3 >> 28; z3 &= M28; + z8 += z7 >> 28; z7 &= M28; + z12 += z11 >> 28; z11 &= M28; + + z1 += z0 >> 28; z0 &= M28; + z5 += z4 >> 28; z4 &= M28; + z9 += z8 >> 28; z8 &= M28; + z13 += z12 >> 28; z12 &= M28; + + z[0] = z0; + z[1] = z1; + z[2] = z2; + z[3] = z3; + z[4] = z4; + z[5] = z5; + z[6] = z6; + z[7] = z7; + z[8] = z8; + z[9] = z9; + z[10] = z10; + z[11] = z11; + z[12] = z12; + z[13] = z13; + z[14] = z14; + z[15] = z15; + } + } +} |