summary refs log tree commit diff
diff options
context:
space:
mode:
author: Peter Dettman <peter.dettman@bouncycastle.org> 2023-04-24 18:30:38 +0700
committer: Peter Dettman <peter.dettman@bouncycastle.org> 2023-04-24 18:30:38 +0700
commit: 0f3fbabfff6072c9ba8f9fdcfe8084fd1c034072 (patch)
tree: d0f947e1f1937c73d46ba74ead1a3f5433163f0b
parentSparkleDIgest: improved tests and fixed regression (diff)
download: BouncyCastle.NET-ed25519-0f3fbabfff6072c9ba8f9fdcfe8084fd1c034072.tar.xz
SparkleDigest perf. opts. (Sse2)
-rw-r--r-- crypto/src/crypto/digests/SparkleDigest.cs 370
1 files changed, 266 insertions, 104 deletions
diff --git a/crypto/src/crypto/digests/SparkleDigest.cs b/crypto/src/crypto/digests/SparkleDigest.cs
index 3729f3ee1..3a87d9e80 100644
--- a/crypto/src/crypto/digests/SparkleDigest.cs
+++ b/crypto/src/crypto/digests/SparkleDigest.cs
@@ -3,6 +3,12 @@ using System.Diagnostics;
 #if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER
 using System.Runtime.CompilerServices;
 #endif
+#if NETCOREAPP3_0_OR_GREATER
+using System.Buffers.Binary;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 using Org.BouncyCastle.Crypto.Utilities;
 using Org.BouncyCastle.Utilities;
@@ -182,19 +188,19 @@ namespace Org.BouncyCastle.Crypto.Digests
             // addition of last msg block (incl. padding)
             ProcessBlock(m_buf, 0, SPARKLE_STEPS_BIG);
 
-            Pack.UInt32_To_LE(state, 0, RATE_UINTS, output, outOff);
-
             if (STATE_UINTS == 16)
             {
+                OutputBlock16(output, outOff);
                 SparkleOpt16(state, SPARKLE_STEPS_SLIM);
-                Pack.UInt32_To_LE(state, 0, RATE_UINTS, output, outOff + 16);
+                OutputBlock16(output, outOff + 16);
                 SparkleOpt16(state, SPARKLE_STEPS_SLIM);
-                Pack.UInt32_To_LE(state, 0, RATE_UINTS, output, outOff + 32);
+                OutputBlock16(output, outOff + 32);
             }
             else
             {
+                OutputBlock12(output, outOff);
                 SparkleOpt12(state, SPARKLE_STEPS_SLIM);
-                Pack.UInt32_To_LE(state, 0, RATE_UINTS, output, outOff + 16);
+                OutputBlock12(output, outOff + 16);
             }
 
             Reset();
@@ -225,19 +231,19 @@ namespace Org.BouncyCastle.Crypto.Digests
             // addition of last msg block (incl. padding)
             ProcessBlock(m_buf, SPARKLE_STEPS_BIG);
 
-            Pack.UInt32_To_LE(state[..RATE_UINTS], output);
-
             if (STATE_UINTS == 16)
             {
+                OutputBlock16(output);
                 SparkleOpt16(state, SPARKLE_STEPS_SLIM);
-                Pack.UInt32_To_LE(state[..RATE_UINTS], output[16..]);
+                OutputBlock16(output[16..]);
                 SparkleOpt16(state, SPARKLE_STEPS_SLIM);
-                Pack.UInt32_To_LE(state[..RATE_UINTS], output[32..]);
+                OutputBlock16(output[32..]);
             }
             else
             {
+                OutputBlock12(output);
                 SparkleOpt12(state, SPARKLE_STEPS_SLIM);
-                Pack.UInt32_To_LE(state[..RATE_UINTS], output[16..]);
+                OutputBlock12(output[16..]);
             }
 
             Reset();
@@ -252,6 +258,34 @@ namespace Org.BouncyCastle.Crypto.Digests
             m_bufPos = 0;
         }
 
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        private void OutputBlock12(Span<byte> output)
+        {
+            Pack.UInt32_To_LE(state[..RATE_UINTS], output);
+        }
+
+        private void OutputBlock16(Span<byte> output)
+        {
+            Pack.UInt32_To_LE(state[0], output);
+            Pack.UInt32_To_LE(state[4], output[4..]);
+            Pack.UInt32_To_LE(state[1], output[8..]);
+            Pack.UInt32_To_LE(state[5], output[12..]);
+        }
+#else
+        private void OutputBlock12(byte[] output, int outOff)
+        {
+            Pack.UInt32_To_LE(state, 0, RATE_UINTS, output, outOff);
+        }
+
+        private void OutputBlock16(byte[] output, int outOff)
+        {
+            Pack.UInt32_To_LE(state[0], output, outOff);
+            Pack.UInt32_To_LE(state[4], output, outOff + 4);
+            Pack.UInt32_To_LE(state[1], output, outOff + 8);
+            Pack.UInt32_To_LE(state[5], output, outOff + 12);
+        }
+#endif
+
 #if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
 #endif
@@ -274,20 +308,26 @@ namespace Org.BouncyCastle.Crypto.Digests
             // addition of a buffer block to the state
             uint tx = ELL(t0 ^ t2);
             uint ty = ELL(t1 ^ t3);
-            state[0] ^= t0 ^ ty;
-            state[1] ^= t1 ^ tx;
-            state[2] ^= t2 ^ ty;
-            state[3] ^= t3 ^ tx;
-            state[4] ^= ty;
-            state[5] ^= tx;
             if (STATE_UINTS == 16)
             {
-                state[6] ^= ty;
+                state[0] ^= t0 ^ ty;
+                state[1] ^= t2 ^ ty;
+                state[2] ^= ty;
+                state[3] ^= ty;
+                state[4] ^= t1 ^ tx;
+                state[5] ^= t3 ^ tx;
+                state[6] ^= tx;
                 state[7] ^= tx;
                 SparkleOpt16(state, steps);
             }
             else
             {
+                state[0] ^= t0 ^ ty;
+                state[1] ^= t1 ^ tx;
+                state[2] ^= t2 ^ ty;
+                state[3] ^= t3 ^ tx;
+                state[4] ^= ty;
+                state[5] ^= tx;
                 SparkleOpt12(state, steps);
             }
         }
@@ -368,113 +408,159 @@ namespace Org.BouncyCastle.Crypto.Digests
         {
             Debug.Assert((steps & 1) == 0);
 
-            uint s00 = state[ 0];
-            uint s01 = state[ 1];
-            uint s02 = state[ 2];
-            uint s03 = state[ 3];
-            uint s04 = state[ 4];
-            uint s05 = state[ 5];
-            uint s06 = state[ 6];
-            uint s07 = state[ 7];
-            uint s08 = state[ 8];
-            uint s09 = state[ 9];
-            uint s10 = state[10];
-            uint s11 = state[11];
-            uint s12 = state[12];
-            uint s13 = state[13];
-            uint s14 = state[14];
-            uint s15 = state[15];
+#if NETCOREAPP3_0_OR_GREATER
+            if (Sse2.IsSupported)
+            {
+                var s0246 = Load128(state.AsSpan(0));
+                var s1357 = Load128(state.AsSpan(4));
+                var s8ACE = Load128(state.AsSpan(8));
+                var s9BDF = Load128(state.AsSpan(12));
+
+                var RC03 = Load128(RCON.AsSpan(0));
+                var RC47 = Load128(RCON.AsSpan(4));
+
+                for (int step = 0; step < steps; ++step)
+                {
+                    // Add round constant
+
+                    s1357 = Sse2.Xor(s1357, Vector128.Create(RCON[step & 7], (uint)step, 0U, 0U));
+
+                    // ARXBOX layer
+
+                    ArxBoxRound(RC03, ref s0246, ref s1357);
+                    ArxBoxRound(RC47, ref s8ACE, ref s9BDF);
+
+                    // Linear layer
+
+                    var t0246 = ELL(HorizontalXor(s0246));
+                    var t1357 = ELL(HorizontalXor(s1357));
+
+                    var u0246 = Sse2.Xor(s0246, s8ACE);
+                    var u1357 = Sse2.Xor(s1357, s9BDF);
 
-            int step = 0;
-            while (step < steps)
+                    s8ACE = s0246;
+                    s9BDF = s1357;
+
+                    s0246 = Sse2.Xor(t1357, Sse2.Shuffle(u0246, 0x39));
+                    s1357 = Sse2.Xor(t0246, Sse2.Shuffle(u1357, 0x39));
+                }
+
+                Store128(s0246, state.AsSpan(0));
+                Store128(s1357, state.AsSpan(4));
+                Store128(s8ACE, state.AsSpan(8));
+                Store128(s9BDF, state.AsSpan(12));
+            }
+            else
+#endif
             {
-                // STEP 1
+                uint s00 = state[ 0];
+                uint s02 = state[ 1];
+                uint s04 = state[ 2];
+                uint s06 = state[ 3];
+                uint s01 = state[ 4];
+                uint s03 = state[ 5];
+                uint s05 = state[ 6];
+                uint s07 = state[ 7];
+                uint s08 = state[ 8];
+                uint s10 = state[ 9];
+                uint s12 = state[10];
+                uint s14 = state[11];
+                uint s09 = state[12];
+                uint s11 = state[13];
+                uint s13 = state[14];
+                uint s15 = state[15];
+
+                int step = 0;
+                while (step < steps)
+                {
+                    // STEP 1
 
-                // Add round constant
+                    // Add round constant
 
-                s01 ^= RCON[step & 7];
-                s03 ^= (uint)(step++);
+                    s01 ^= RCON[step & 7];
+                    s03 ^= (uint)(step++);
 
-                // ARXBOX layer
+                    // ARXBOX layer
 
-                ArxBoxRound(RCON[0], ref s00, ref s01);
-                ArxBoxRound(RCON[1], ref s02, ref s03);
-                ArxBoxRound(RCON[2], ref s04, ref s05);
-                ArxBoxRound(RCON[3], ref s06, ref s07);
-                ArxBoxRound(RCON[4], ref s08, ref s09);
-                ArxBoxRound(RCON[5], ref s10, ref s11);
-                ArxBoxRound(RCON[6], ref s12, ref s13);
-                ArxBoxRound(RCON[7], ref s14, ref s15);
+                    ArxBoxRound(RCON[0], ref s00, ref s01);
+                    ArxBoxRound(RCON[1], ref s02, ref s03);
+                    ArxBoxRound(RCON[2], ref s04, ref s05);
+                    ArxBoxRound(RCON[3], ref s06, ref s07);
+                    ArxBoxRound(RCON[4], ref s08, ref s09);
+                    ArxBoxRound(RCON[5], ref s10, ref s11);
+                    ArxBoxRound(RCON[6], ref s12, ref s13);
+                    ArxBoxRound(RCON[7], ref s14, ref s15);
 
-                // Linear layer
+                    // Linear layer
 
-                uint t0246 = ELL(s00 ^ s02 ^ s04 ^ s06);
-                uint t1357 = ELL(s01 ^ s03 ^ s05 ^ s07);
+                    uint t0246 = ELL(s00 ^ s02 ^ s04 ^ s06);
+                    uint t1357 = ELL(s01 ^ s03 ^ s05 ^ s07);
 
-                uint u08 = s08;
-                uint u09 = s09;
+                    uint u08 = s08;
+                    uint u09 = s09;
 
-                s08 = s02 ^ s10 ^ t1357;
-                s09 = s03 ^ s11 ^ t0246;
-                s10 = s04 ^ s12 ^ t1357;
-                s11 = s05 ^ s13 ^ t0246;
-                s12 = s06 ^ s14 ^ t1357;
-                s13 = s07 ^ s15 ^ t0246;
-                s14 = s00 ^ u08 ^ t1357;
-                s15 = s01 ^ u09 ^ t0246;
+                    s08 = s02 ^ s10 ^ t1357;
+                    s09 = s03 ^ s11 ^ t0246;
+                    s10 = s04 ^ s12 ^ t1357;
+                    s11 = s05 ^ s13 ^ t0246;
+                    s12 = s06 ^ s14 ^ t1357;
+                    s13 = s07 ^ s15 ^ t0246;
+                    s14 = s00 ^ u08 ^ t1357;
+                    s15 = s01 ^ u09 ^ t0246;
 
-                // STEP 2
+                    // STEP 2
 
-                // Add round constant
+                    // Add round constant
 
-                s09 ^= RCON[step & 7];
-                s11 ^= (uint)(step++);
+                    s09 ^= RCON[step & 7];
+                    s11 ^= (uint)(step++);
 
-                // ARXBOX layer
+                    // ARXBOX layer
 
-                ArxBoxRound(RCON[0], ref s08, ref s09);
-                ArxBoxRound(RCON[1], ref s10, ref s11);
-                ArxBoxRound(RCON[2], ref s12, ref s13);
-                ArxBoxRound(RCON[3], ref s14, ref s15);
-                ArxBoxRound(RCON[4], ref s00, ref s01);
-                ArxBoxRound(RCON[5], ref s02, ref s03);
-                ArxBoxRound(RCON[6], ref s04, ref s05);
-                ArxBoxRound(RCON[7], ref s06, ref s07);
+                    ArxBoxRound(RCON[0], ref s08, ref s09);
+                    ArxBoxRound(RCON[1], ref s10, ref s11);
+                    ArxBoxRound(RCON[2], ref s12, ref s13);
+                    ArxBoxRound(RCON[3], ref s14, ref s15);
+                    ArxBoxRound(RCON[4], ref s00, ref s01);
+                    ArxBoxRound(RCON[5], ref s02, ref s03);
+                    ArxBoxRound(RCON[6], ref s04, ref s05);
+                    ArxBoxRound(RCON[7], ref s06, ref s07);
 
-                // Linear layer
+                    // Linear layer
 
-                uint t8ACE = ELL(s08 ^ s10 ^ s12 ^ s14);
-                uint t9BDF = ELL(s09 ^ s11 ^ s13 ^ s15);
+                    uint t8ACE = ELL(s08 ^ s10 ^ s12 ^ s14);
+                    uint t9BDF = ELL(s09 ^ s11 ^ s13 ^ s15);
 
-                uint u00 = s00;
-                uint u01 = s01;
+                    uint u00 = s00;
+                    uint u01 = s01;
 
-                s00 = s02 ^ s10 ^ t9BDF;
-                s01 = s03 ^ s11 ^ t8ACE;
-                s02 = s04 ^ s12 ^ t9BDF;
-                s03 = s05 ^ s13 ^ t8ACE;
-                s04 = s06 ^ s14 ^ t9BDF;
-                s05 = s07 ^ s15 ^ t8ACE;
-                s06 = u00 ^ s08 ^ t9BDF;
-                s07 = u01 ^ s09 ^ t8ACE;
-            }
+                    s00 = s02 ^ s10 ^ t9BDF;
+                    s01 = s03 ^ s11 ^ t8ACE;
+                    s02 = s04 ^ s12 ^ t9BDF;
+                    s03 = s05 ^ s13 ^ t8ACE;
+                    s04 = s06 ^ s14 ^ t9BDF;
+                    s05 = s07 ^ s15 ^ t8ACE;
+                    s06 = u00 ^ s08 ^ t9BDF;
+                    s07 = u01 ^ s09 ^ t8ACE;
+                }
 
-            state[ 0] = s00;
-            state[ 1] = s01;
-            state[ 2] = s02;
-            state[ 3] = s03;
-            state[ 4] = s04;
-            state[ 5] = s05;
-            state[ 6] = s06;
-            state[ 7] = s07;
-            state[ 8] = s08;
-            state[ 9] = s09;
-            state[10] = s10;
-            state[11] = s11;
-            state[12] = s12;
-            state[13] = s13;
-            state[14] = s14;
-            state[15] = s15;
+                state[ 0] = s00;
+                state[ 1] = s02;
+                state[ 2] = s04;
+                state[ 3] = s06;
+                state[ 4] = s01;
+                state[ 5] = s03;
+                state[ 6] = s05;
+                state[ 7] = s07;
+                state[ 8] = s08;
+                state[ 9] = s10;
+                state[10] = s12;
+                state[11] = s14;
+                state[12] = s09;
+                state[13] = s11;
+                state[14] = s13;
+                state[15] = s15;
+            }
         }
 
 #if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER
@@ -503,5 +589,81 @@ namespace Org.BouncyCastle.Crypto.Digests
         {
             return Integers.RotateRight(x, 16) ^ (x & 0xFFFFU);
         }
+
+#if NETCOREAPP3_0_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void ArxBoxRound(Vector128<uint> rc, ref Vector128<uint> s00, ref Vector128<uint> s01)
+        {
+            s00 = Sse2.Add(s00, Sse2.ShiftRightLogical(s01, 31));
+            s00 = Sse2.Add(s00, Sse2.ShiftLeftLogical(s01, 1));
+
+            s01 = Sse2.Xor(s01, Sse2.ShiftRightLogical(s00, 24));
+            s01 = Sse2.Xor(s01, Sse2.ShiftLeftLogical(s00, 8));
+
+            s00 = Sse2.Xor(s00, rc);
+
+            s00 = Sse2.Add(s00, Sse2.ShiftRightLogical(s01, 17));
+            s00 = Sse2.Add(s00, Sse2.ShiftLeftLogical(s01, 15));
+
+            s01 = Sse2.Xor(s01, Sse2.ShiftRightLogical(s00, 17));
+            s01 = Sse2.Xor(s01, Sse2.ShiftLeftLogical(s00, 15));
+
+            s00 = Sse2.Xor(s00, rc);
+
+            s00 = Sse2.Add(s00, s01);
+
+            s01 = Sse2.Xor(s01, Sse2.ShiftRightLogical(s00, 31));
+            s01 = Sse2.Xor(s01, Sse2.ShiftLeftLogical(s00, 1));
+
+            s00 = Sse2.Xor(s00, rc);
+
+            s00 = Sse2.Add(s00, Sse2.ShiftRightLogical(s01, 24));
+            s00 = Sse2.Add(s00, Sse2.ShiftLeftLogical(s01, 8));
+
+            s01 = Sse2.Xor(s01, Sse2.ShiftRightLogical(s00, 16));
+            s01 = Sse2.Xor(s01, Sse2.ShiftLeftLogical(s00, 16));
+
+            s00 = Sse2.Xor(s00, rc);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<uint> ELL(Vector128<uint> x)
+        {
+            var t = Sse2.ShiftLeftLogical(x, 16);
+            var u = Sse2.Xor(x, t);
+            return Sse2.Xor(t, Sse2.ShiftRightLogical(u, 16));
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<uint> HorizontalXor(Vector128<uint> x)
+        {
+            var t = Sse2.Xor(x, Sse2.Shuffle(x, 0x1B));
+            return Sse2.Xor(t, Sse2.Shuffle(t, 0xB1));
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<uint> Load128(ReadOnlySpan<uint> t)
+        {
+            if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<uint>>() == 16)
+                return MemoryMarshal.Read<Vector128<uint>>(MemoryMarshal.AsBytes(t));
+
+            return Vector128.Create(t[0], t[1], t[2], t[3]);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void Store128(Vector128<uint> s, Span<uint> t)
+        {
+            var b = MemoryMarshal.AsBytes(t);
+            if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<uint>>() == 16)
+            {
+                MemoryMarshal.Write(b, ref s);
+                return;
+            }
+
+            var u = s.AsUInt64();
+            BinaryPrimitives.WriteUInt64LittleEndian(b[..8], u.GetElement(0));
+            BinaryPrimitives.WriteUInt64LittleEndian(b[8..], u.GetElement(1));
+        }
+#endif
     }
 }