diff --git a/crypto/src/crypto/digests/SparkleDigest.cs b/crypto/src/crypto/digests/SparkleDigest.cs
index 3729f3ee1..3a87d9e80 100644
--- a/crypto/src/crypto/digests/SparkleDigest.cs
+++ b/crypto/src/crypto/digests/SparkleDigest.cs
@@ -3,6 +3,12 @@ using System.Diagnostics;
#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER
using System.Runtime.CompilerServices;
#endif
+#if NETCOREAPP3_0_OR_GREATER
+using System.Buffers.Binary;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
using Org.BouncyCastle.Crypto.Utilities;
using Org.BouncyCastle.Utilities;
@@ -182,19 +188,19 @@ namespace Org.BouncyCastle.Crypto.Digests
// addition of last msg block (incl. padding)
ProcessBlock(m_buf, 0, SPARKLE_STEPS_BIG);
- Pack.UInt32_To_LE(state, 0, RATE_UINTS, output, outOff);
-
if (STATE_UINTS == 16)
{
+ OutputBlock16(output, outOff);
SparkleOpt16(state, SPARKLE_STEPS_SLIM);
- Pack.UInt32_To_LE(state, 0, RATE_UINTS, output, outOff + 16);
+ OutputBlock16(output, outOff + 16);
SparkleOpt16(state, SPARKLE_STEPS_SLIM);
- Pack.UInt32_To_LE(state, 0, RATE_UINTS, output, outOff + 32);
+ OutputBlock16(output, outOff + 32);
}
else
{
+ OutputBlock12(output, outOff);
SparkleOpt12(state, SPARKLE_STEPS_SLIM);
- Pack.UInt32_To_LE(state, 0, RATE_UINTS, output, outOff + 16);
+ OutputBlock12(output, outOff + 16);
}
Reset();
@@ -225,19 +231,19 @@ namespace Org.BouncyCastle.Crypto.Digests
// addition of last msg block (incl. padding)
ProcessBlock(m_buf, SPARKLE_STEPS_BIG);
- Pack.UInt32_To_LE(state[..RATE_UINTS], output);
-
if (STATE_UINTS == 16)
{
+ OutputBlock16(output);
SparkleOpt16(state, SPARKLE_STEPS_SLIM);
- Pack.UInt32_To_LE(state[..RATE_UINTS], output[16..]);
+ OutputBlock16(output[16..]);
SparkleOpt16(state, SPARKLE_STEPS_SLIM);
- Pack.UInt32_To_LE(state[..RATE_UINTS], output[32..]);
+ OutputBlock16(output[32..]);
}
else
{
+ OutputBlock12(output);
SparkleOpt12(state, SPARKLE_STEPS_SLIM);
- Pack.UInt32_To_LE(state[..RATE_UINTS], output[16..]);
+ OutputBlock12(output[16..]);
}
Reset();
@@ -252,6 +258,34 @@ namespace Org.BouncyCastle.Crypto.Digests
m_bufPos = 0;
}
+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+ private void OutputBlock12(Span<byte> output)
+ {
+ Pack.UInt32_To_LE(state[..RATE_UINTS], output);
+ }
+
+ private void OutputBlock16(Span<byte> output)
+ {
+ Pack.UInt32_To_LE(state[0], output);
+ Pack.UInt32_To_LE(state[4], output[4..]);
+ Pack.UInt32_To_LE(state[1], output[8..]);
+ Pack.UInt32_To_LE(state[5], output[12..]);
+ }
+#else
+ private void OutputBlock12(byte[] output, int outOff)
+ {
+ Pack.UInt32_To_LE(state, 0, RATE_UINTS, output, outOff);
+ }
+
+ private void OutputBlock16(byte[] output, int outOff)
+ {
+ Pack.UInt32_To_LE(state[0], output, outOff);
+ Pack.UInt32_To_LE(state[4], output, outOff + 4);
+ Pack.UInt32_To_LE(state[1], output, outOff + 8);
+ Pack.UInt32_To_LE(state[5], output, outOff + 12);
+ }
+#endif
+
#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER
[MethodImpl(MethodImplOptions.AggressiveInlining)]
#endif
@@ -274,20 +308,26 @@ namespace Org.BouncyCastle.Crypto.Digests
// addition of a buffer block to the state
uint tx = ELL(t0 ^ t2);
uint ty = ELL(t1 ^ t3);
- state[0] ^= t0 ^ ty;
- state[1] ^= t1 ^ tx;
- state[2] ^= t2 ^ ty;
- state[3] ^= t3 ^ tx;
- state[4] ^= ty;
- state[5] ^= tx;
if (STATE_UINTS == 16)
{
- state[6] ^= ty;
+ state[0] ^= t0 ^ ty;
+ state[1] ^= t2 ^ ty;
+ state[2] ^= ty;
+ state[3] ^= ty;
+ state[4] ^= t1 ^ tx;
+ state[5] ^= t3 ^ tx;
+ state[6] ^= tx;
state[7] ^= tx;
SparkleOpt16(state, steps);
}
else
{
+ state[0] ^= t0 ^ ty;
+ state[1] ^= t1 ^ tx;
+ state[2] ^= t2 ^ ty;
+ state[3] ^= t3 ^ tx;
+ state[4] ^= ty;
+ state[5] ^= tx;
SparkleOpt12(state, steps);
}
}
@@ -368,113 +408,159 @@ namespace Org.BouncyCastle.Crypto.Digests
{
Debug.Assert((steps & 1) == 0);
- uint s00 = state[ 0];
- uint s01 = state[ 1];
- uint s02 = state[ 2];
- uint s03 = state[ 3];
- uint s04 = state[ 4];
- uint s05 = state[ 5];
- uint s06 = state[ 6];
- uint s07 = state[ 7];
- uint s08 = state[ 8];
- uint s09 = state[ 9];
- uint s10 = state[10];
- uint s11 = state[11];
- uint s12 = state[12];
- uint s13 = state[13];
- uint s14 = state[14];
- uint s15 = state[15];
+#if NETCOREAPP3_0_OR_GREATER
+ if (Sse2.IsSupported)
+ {
+ var s0246 = Load128(state.AsSpan(0));
+ var s1357 = Load128(state.AsSpan(4));
+ var s8ACE = Load128(state.AsSpan(8));
+ var s9BDF = Load128(state.AsSpan(12));
+
+ var RC03 = Load128(RCON.AsSpan(0));
+ var RC47 = Load128(RCON.AsSpan(4));
+
+ for (int step = 0; step < steps; ++step)
+ {
+ // Add round constant
+
+ s1357 = Sse2.Xor(s1357, Vector128.Create(RCON[step & 7], (uint)step, 0U, 0U));
+
+ // ARXBOX layer
+
+ ArxBoxRound(RC03, ref s0246, ref s1357);
+ ArxBoxRound(RC47, ref s8ACE, ref s9BDF);
+
+ // Linear layer
+
+ var t0246 = ELL(HorizontalXor(s0246));
+ var t1357 = ELL(HorizontalXor(s1357));
+
+ var u0246 = Sse2.Xor(s0246, s8ACE);
+ var u1357 = Sse2.Xor(s1357, s9BDF);
- int step = 0;
- while (step < steps)
+ s8ACE = s0246;
+ s9BDF = s1357;
+
+ s0246 = Sse2.Xor(t1357, Sse2.Shuffle(u0246, 0x39));
+ s1357 = Sse2.Xor(t0246, Sse2.Shuffle(u1357, 0x39));
+ }
+
+ Store128(s0246, state.AsSpan(0));
+ Store128(s1357, state.AsSpan(4));
+ Store128(s8ACE, state.AsSpan(8));
+ Store128(s9BDF, state.AsSpan(12));
+ }
+ else
+#endif
{
- // STEP 1
+ uint s00 = state[ 0];
+ uint s02 = state[ 1];
+ uint s04 = state[ 2];
+ uint s06 = state[ 3];
+ uint s01 = state[ 4];
+ uint s03 = state[ 5];
+ uint s05 = state[ 6];
+ uint s07 = state[ 7];
+ uint s08 = state[ 8];
+ uint s10 = state[ 9];
+ uint s12 = state[10];
+ uint s14 = state[11];
+ uint s09 = state[12];
+ uint s11 = state[13];
+ uint s13 = state[14];
+ uint s15 = state[15];
+
+ int step = 0;
+ while (step < steps)
+ {
+ // STEP 1
- // Add round ant
+ // Add round constant
- s01 ^= RCON[step & 7];
- s03 ^= (uint)(step++);
+ s01 ^= RCON[step & 7];
+ s03 ^= (uint)(step++);
- // ARXBOX layer
+ // ARXBOX layer
- ArxBoxRound(RCON[0], ref s00, ref s01);
- ArxBoxRound(RCON[1], ref s02, ref s03);
- ArxBoxRound(RCON[2], ref s04, ref s05);
- ArxBoxRound(RCON[3], ref s06, ref s07);
- ArxBoxRound(RCON[4], ref s08, ref s09);
- ArxBoxRound(RCON[5], ref s10, ref s11);
- ArxBoxRound(RCON[6], ref s12, ref s13);
- ArxBoxRound(RCON[7], ref s14, ref s15);
+ ArxBoxRound(RCON[0], ref s00, ref s01);
+ ArxBoxRound(RCON[1], ref s02, ref s03);
+ ArxBoxRound(RCON[2], ref s04, ref s05);
+ ArxBoxRound(RCON[3], ref s06, ref s07);
+ ArxBoxRound(RCON[4], ref s08, ref s09);
+ ArxBoxRound(RCON[5], ref s10, ref s11);
+ ArxBoxRound(RCON[6], ref s12, ref s13);
+ ArxBoxRound(RCON[7], ref s14, ref s15);
- // Linear layer
+ // Linear layer
- uint t0246 = ELL(s00 ^ s02 ^ s04 ^ s06);
- uint t1357 = ELL(s01 ^ s03 ^ s05 ^ s07);
+ uint t0246 = ELL(s00 ^ s02 ^ s04 ^ s06);
+ uint t1357 = ELL(s01 ^ s03 ^ s05 ^ s07);
- uint u08 = s08;
- uint u09 = s09;
+ uint u08 = s08;
+ uint u09 = s09;
- s08 = s02 ^ s10 ^ t1357;
- s09 = s03 ^ s11 ^ t0246;
- s10 = s04 ^ s12 ^ t1357;
- s11 = s05 ^ s13 ^ t0246;
- s12 = s06 ^ s14 ^ t1357;
- s13 = s07 ^ s15 ^ t0246;
- s14 = s00 ^ u08 ^ t1357;
- s15 = s01 ^ u09 ^ t0246;
+ s08 = s02 ^ s10 ^ t1357;
+ s09 = s03 ^ s11 ^ t0246;
+ s10 = s04 ^ s12 ^ t1357;
+ s11 = s05 ^ s13 ^ t0246;
+ s12 = s06 ^ s14 ^ t1357;
+ s13 = s07 ^ s15 ^ t0246;
+ s14 = s00 ^ u08 ^ t1357;
+ s15 = s01 ^ u09 ^ t0246;
- // STEP 2
+ // STEP 2
- // Add round ant
+ // Add round constant
- s09 ^= RCON[step & 7];
- s11 ^= (uint)(step++);
+ s09 ^= RCON[step & 7];
+ s11 ^= (uint)(step++);
- // ARXBOX layer
+ // ARXBOX layer
- ArxBoxRound(RCON[0], ref s08, ref s09);
- ArxBoxRound(RCON[1], ref s10, ref s11);
- ArxBoxRound(RCON[2], ref s12, ref s13);
- ArxBoxRound(RCON[3], ref s14, ref s15);
- ArxBoxRound(RCON[4], ref s00, ref s01);
- ArxBoxRound(RCON[5], ref s02, ref s03);
- ArxBoxRound(RCON[6], ref s04, ref s05);
- ArxBoxRound(RCON[7], ref s06, ref s07);
+ ArxBoxRound(RCON[0], ref s08, ref s09);
+ ArxBoxRound(RCON[1], ref s10, ref s11);
+ ArxBoxRound(RCON[2], ref s12, ref s13);
+ ArxBoxRound(RCON[3], ref s14, ref s15);
+ ArxBoxRound(RCON[4], ref s00, ref s01);
+ ArxBoxRound(RCON[5], ref s02, ref s03);
+ ArxBoxRound(RCON[6], ref s04, ref s05);
+ ArxBoxRound(RCON[7], ref s06, ref s07);
- // Linear layer
+ // Linear layer
- uint t8ACE = ELL(s08 ^ s10 ^ s12 ^ s14);
- uint t9BDF = ELL(s09 ^ s11 ^ s13 ^ s15);
+ uint t8ACE = ELL(s08 ^ s10 ^ s12 ^ s14);
+ uint t9BDF = ELL(s09 ^ s11 ^ s13 ^ s15);
- uint u00 = s00;
- uint u01 = s01;
+ uint u00 = s00;
+ uint u01 = s01;
- s00 = s02 ^ s10 ^ t9BDF;
- s01 = s03 ^ s11 ^ t8ACE;
- s02 = s04 ^ s12 ^ t9BDF;
- s03 = s05 ^ s13 ^ t8ACE;
- s04 = s06 ^ s14 ^ t9BDF;
- s05 = s07 ^ s15 ^ t8ACE;
- s06 = u00 ^ s08 ^ t9BDF;
- s07 = u01 ^ s09 ^ t8ACE;
- }
+ s00 = s02 ^ s10 ^ t9BDF;
+ s01 = s03 ^ s11 ^ t8ACE;
+ s02 = s04 ^ s12 ^ t9BDF;
+ s03 = s05 ^ s13 ^ t8ACE;
+ s04 = s06 ^ s14 ^ t9BDF;
+ s05 = s07 ^ s15 ^ t8ACE;
+ s06 = u00 ^ s08 ^ t9BDF;
+ s07 = u01 ^ s09 ^ t8ACE;
+ }
- state[ 0] = s00;
- state[ 1] = s01;
- state[ 2] = s02;
- state[ 3] = s03;
- state[ 4] = s04;
- state[ 5] = s05;
- state[ 6] = s06;
- state[ 7] = s07;
- state[ 8] = s08;
- state[ 9] = s09;
- state[10] = s10;
- state[11] = s11;
- state[12] = s12;
- state[13] = s13;
- state[14] = s14;
- state[15] = s15;
+ state[ 0] = s00;
+ state[ 1] = s02;
+ state[ 2] = s04;
+ state[ 3] = s06;
+ state[ 4] = s01;
+ state[ 5] = s03;
+ state[ 6] = s05;
+ state[ 7] = s07;
+ state[ 8] = s08;
+ state[ 9] = s10;
+ state[10] = s12;
+ state[11] = s14;
+ state[12] = s09;
+ state[13] = s11;
+ state[14] = s13;
+ state[15] = s15;
+ }
}
#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER
@@ -503,5 +589,81 @@ namespace Org.BouncyCastle.Crypto.Digests
{
return Integers.RotateRight(x, 16) ^ (x & 0xFFFFU);
}
+
+#if NETCOREAPP3_0_OR_GREATER
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void ArxBoxRound(Vector128<uint> rc, ref Vector128<uint> s00, ref Vector128<uint> s01)
+ {
+ s00 = Sse2.Add(s00, Sse2.ShiftRightLogical(s01, 31));
+ s00 = Sse2.Add(s00, Sse2.ShiftLeftLogical(s01, 1));
+
+ s01 = Sse2.Xor(s01, Sse2.ShiftRightLogical(s00, 24));
+ s01 = Sse2.Xor(s01, Sse2.ShiftLeftLogical(s00, 8));
+
+ s00 = Sse2.Xor(s00, rc);
+
+ s00 = Sse2.Add(s00, Sse2.ShiftRightLogical(s01, 17));
+ s00 = Sse2.Add(s00, Sse2.ShiftLeftLogical(s01, 15));
+
+ s01 = Sse2.Xor(s01, Sse2.ShiftRightLogical(s00, 17));
+ s01 = Sse2.Xor(s01, Sse2.ShiftLeftLogical(s00, 15));
+
+ s00 = Sse2.Xor(s00, rc);
+
+ s00 = Sse2.Add(s00, s01);
+
+ s01 = Sse2.Xor(s01, Sse2.ShiftRightLogical(s00, 31));
+ s01 = Sse2.Xor(s01, Sse2.ShiftLeftLogical(s00, 1));
+
+ s00 = Sse2.Xor(s00, rc);
+
+ s00 = Sse2.Add(s00, Sse2.ShiftRightLogical(s01, 24));
+ s00 = Sse2.Add(s00, Sse2.ShiftLeftLogical(s01, 8));
+
+ s01 = Sse2.Xor(s01, Sse2.ShiftRightLogical(s00, 16));
+ s01 = Sse2.Xor(s01, Sse2.ShiftLeftLogical(s00, 16));
+
+ s00 = Sse2.Xor(s00, rc);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static Vector128<uint> ELL(Vector128<uint> x)
+ {
+ var t = Sse2.ShiftLeftLogical(x, 16);
+ var u = Sse2.Xor(x, t);
+ return Sse2.Xor(t, Sse2.ShiftRightLogical(u, 16));
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static Vector128<uint> HorizontalXor(Vector128<uint> x)
+ {
+ var t = Sse2.Xor(x, Sse2.Shuffle(x, 0x1B));
+ return Sse2.Xor(t, Sse2.Shuffle(t, 0xB1));
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static Vector128<uint> Load128(ReadOnlySpan<uint> t)
+ {
+ if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<uint>>() == 16)
+ return MemoryMarshal.Read<Vector128<uint>>(MemoryMarshal.AsBytes(t));
+
+ return Vector128.Create(t[0], t[1], t[2], t[3]);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void Store128(Vector128<uint> s, Span<uint> t)
+ {
+ var b = MemoryMarshal.AsBytes(t);
+ if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<uint>>() == 16)
+ {
+ MemoryMarshal.Write(b, ref s);
+ return;
+ }
+
+ var u = s.AsUInt64();
+ BinaryPrimitives.WriteUInt64LittleEndian(b[..8], u.GetElement(0));
+ BinaryPrimitives.WriteUInt64LittleEndian(b[8..], u.GetElement(1));
+ }
+#endif
}
}
|