| author | Peter Dettman <peter.dettman@bouncycastle.org> | 2023-04-24 18:30:38 +0700 |
| --- | --- | --- |
| committer | Peter Dettman <peter.dettman@bouncycastle.org> | 2023-04-24 18:30:38 +0700 |
| commit | 0f3fbabfff6072c9ba8f9fdcfe8084fd1c034072 (patch) | |
| tree | d0f947e1f1937c73d46ba74ead1a3f5433163f0b | |
| parent | SparkleDIgest: improved tests and fixed regression (diff) | |
| download | BouncyCastle.NET-ed25519-0f3fbabfff6072c9ba8f9fdcfe8084fd1c034072.tar.xz | |
SparkleDigest perf. opts. (Sse2)
-rw-r--r-- | crypto/src/crypto/digests/SparkleDigest.cs | 370 |
1 file changed, 266 insertions, 104 deletions
diff --git a/crypto/src/crypto/digests/SparkleDigest.cs b/crypto/src/crypto/digests/SparkleDigest.cs
index 3729f3ee1..3a87d9e80 100644
--- a/crypto/src/crypto/digests/SparkleDigest.cs
+++ b/crypto/src/crypto/digests/SparkleDigest.cs
@@ -3,6 +3,12 @@ using System.Diagnostics;
 #if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER
 using System.Runtime.CompilerServices;
 #endif
+#if NETCOREAPP3_0_OR_GREATER
+using System.Buffers.Binary;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif

 using Org.BouncyCastle.Crypto.Utilities;
 using Org.BouncyCastle.Utilities;
@@ -182,19 +188,19 @@ namespace Org.BouncyCastle.Crypto.Digests
             // addition of last msg block (incl. padding)
             ProcessBlock(m_buf, 0, SPARKLE_STEPS_BIG);

-            Pack.UInt32_To_LE(state, 0, RATE_UINTS, output, outOff);
-
             if (STATE_UINTS == 16)
             {
+                OutputBlock16(output, outOff);
                 SparkleOpt16(state, SPARKLE_STEPS_SLIM);
-                Pack.UInt32_To_LE(state, 0, RATE_UINTS, output, outOff + 16);
+                OutputBlock16(output, outOff + 16);
                 SparkleOpt16(state, SPARKLE_STEPS_SLIM);
-                Pack.UInt32_To_LE(state, 0, RATE_UINTS, output, outOff + 32);
+                OutputBlock16(output, outOff + 32);
             }
             else
             {
+                OutputBlock12(output, outOff);
                 SparkleOpt12(state, SPARKLE_STEPS_SLIM);
-                Pack.UInt32_To_LE(state, 0, RATE_UINTS, output, outOff + 16);
+                OutputBlock12(output, outOff + 16);
             }

             Reset();
@@ -225,19 +231,19 @@ namespace Org.BouncyCastle.Crypto.Digests
             // addition of last msg block (incl. padding)
             ProcessBlock(m_buf, SPARKLE_STEPS_BIG);

-            Pack.UInt32_To_LE(state[..RATE_UINTS], output);
-
             if (STATE_UINTS == 16)
             {
+                OutputBlock16(output);
                 SparkleOpt16(state, SPARKLE_STEPS_SLIM);
-                Pack.UInt32_To_LE(state[..RATE_UINTS], output[16..]);
+                OutputBlock16(output[16..]);
                 SparkleOpt16(state, SPARKLE_STEPS_SLIM);
-                Pack.UInt32_To_LE(state[..RATE_UINTS], output[32..]);
+                OutputBlock16(output[32..]);
             }
             else
             {
+                OutputBlock12(output);
                 SparkleOpt12(state, SPARKLE_STEPS_SLIM);
-                Pack.UInt32_To_LE(state[..RATE_UINTS], output[16..]);
+                OutputBlock12(output[16..]);
             }

             Reset();
@@ -252,6 +258,34 @@ namespace Org.BouncyCastle.Crypto.Digests
             m_bufPos = 0;
         }

+#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        private void OutputBlock12(Span<byte> output)
+        {
+            Pack.UInt32_To_LE(state[..RATE_UINTS], output);
+        }
+
+        private void OutputBlock16(Span<byte> output)
+        {
+            Pack.UInt32_To_LE(state[0], output);
+            Pack.UInt32_To_LE(state[4], output[4..]);
+            Pack.UInt32_To_LE(state[1], output[8..]);
+            Pack.UInt32_To_LE(state[5], output[12..]);
+        }
+#else
+        private void OutputBlock12(byte[] output, int outOff)
+        {
+            Pack.UInt32_To_LE(state, 0, RATE_UINTS, output, outOff);
+        }
+
+        private void OutputBlock16(byte[] output, int outOff)
+        {
+            Pack.UInt32_To_LE(state[0], output, outOff);
+            Pack.UInt32_To_LE(state[4], output, outOff + 4);
+            Pack.UInt32_To_LE(state[1], output, outOff + 8);
+            Pack.UInt32_To_LE(state[5], output, outOff + 12);
+        }
+#endif
+
 #if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
 #endif
@@ -274,20 +308,26 @@ namespace Org.BouncyCastle.Crypto.Digests
             // addition of a buffer block to the state
             uint tx = ELL(t0 ^ t2);
             uint ty = ELL(t1 ^ t3);
-            state[0] ^= t0 ^ ty;
-            state[1] ^= t1 ^ tx;
-            state[2] ^= t2 ^ ty;
-            state[3] ^= t3 ^ tx;
-            state[4] ^= ty;
-            state[5] ^= tx;
             if (STATE_UINTS == 16)
             {
-                state[6] ^= ty;
+                state[0] ^= t0 ^ ty;
+                state[1] ^= t2 ^ ty;
+                state[2] ^= ty;
+                state[3] ^= ty;
+                state[4] ^= t1 ^ tx;
+                state[5] ^= t3 ^ tx;
+                state[6] ^= tx;
                 state[7] ^= tx;
                 SparkleOpt16(state, steps);
             }
             else
             {
+                state[0] ^= t0 ^ ty;
+                state[1] ^= t1 ^ tx;
+                state[2] ^= t2 ^ ty;
+                state[3] ^= t3 ^ tx;
+                state[4] ^= ty;
+                state[5] ^= tx;
                 SparkleOpt12(state, steps);
             }
         }
@@ -368,113 +408,159 @@ namespace Org.BouncyCastle.Crypto.Digests
         {
             Debug.Assert((steps & 1) == 0);

-            uint s00 = state[ 0];
-            uint s01 = state[ 1];
-            uint s02 = state[ 2];
-            uint s03 = state[ 3];
-            uint s04 = state[ 4];
-            uint s05 = state[ 5];
-            uint s06 = state[ 6];
-            uint s07 = state[ 7];
-            uint s08 = state[ 8];
-            uint s09 = state[ 9];
-            uint s10 = state[10];
-            uint s11 = state[11];
-            uint s12 = state[12];
-            uint s13 = state[13];
-            uint s14 = state[14];
-            uint s15 = state[15];
+#if NETCOREAPP3_0_OR_GREATER
+            if (Sse2.IsSupported)
+            {
+                var s0246 = Load128(state.AsSpan(0));
+                var s1357 = Load128(state.AsSpan(4));
+                var s8ACE = Load128(state.AsSpan(8));
+                var s9BDF = Load128(state.AsSpan(12));
+
+                var RC03 = Load128(RCON.AsSpan(0));
+                var RC47 = Load128(RCON.AsSpan(4));
+
+                for (int step = 0; step < steps; ++step)
+                {
+                    // Add round constant
+
+                    s1357 = Sse2.Xor(s1357, Vector128.Create(RCON[step & 7], (uint)step, 0U, 0U));
+
+                    // ARXBOX layer
+
+                    ArxBoxRound(RC03, ref s0246, ref s1357);
+                    ArxBoxRound(RC47, ref s8ACE, ref s9BDF);
+
+                    // Linear layer
+
+                    var t0246 = ELL(HorizontalXor(s0246));
+                    var t1357 = ELL(HorizontalXor(s1357));
+
+                    var u0246 = Sse2.Xor(s0246, s8ACE);
+                    var u1357 = Sse2.Xor(s1357, s9BDF);

-            int step = 0;
-            while (step < steps)
+                    s8ACE = s0246;
+                    s9BDF = s1357;
+
+                    s0246 = Sse2.Xor(t1357, Sse2.Shuffle(u0246, 0x39));
+                    s1357 = Sse2.Xor(t0246, Sse2.Shuffle(u1357, 0x39));
+                }
+
+                Store128(s0246, state.AsSpan(0));
+                Store128(s1357, state.AsSpan(4));
+                Store128(s8ACE, state.AsSpan(8));
+                Store128(s9BDF, state.AsSpan(12));
+            }
+            else
+#endif
             {
-                // STEP 1
+                uint s00 = state[ 0];
+                uint s02 = state[ 1];
+                uint s04 = state[ 2];
+                uint s06 = state[ 3];
+                uint s01 = state[ 4];
+                uint s03 = state[ 5];
+                uint s05 = state[ 6];
+                uint s07 = state[ 7];
+                uint s08 = state[ 8];
+                uint s10 = state[ 9];
+                uint s12 = state[10];
+                uint s14 = state[11];
+                uint s09 = state[12];
+                uint s11 = state[13];
+                uint s13 = state[14];
+                uint s15 = state[15];
+
+                int step = 0;
+                while (step < steps)
+                {
+                    // STEP 1

-                // Add round constant
+                    // Add round constant

-                s01 ^= RCON[step & 7];
-                s03 ^= (uint)(step++);
+                    s01 ^= RCON[step & 7];
+                    s03 ^= (uint)(step++);

-                // ARXBOX layer
+                    // ARXBOX layer

-                ArxBoxRound(RCON[0], ref s00, ref s01);
-                ArxBoxRound(RCON[1], ref s02, ref s03);
-                ArxBoxRound(RCON[2], ref s04, ref s05);
-                ArxBoxRound(RCON[3], ref s06, ref s07);
-                ArxBoxRound(RCON[4], ref s08, ref s09);
-                ArxBoxRound(RCON[5], ref s10, ref s11);
-                ArxBoxRound(RCON[6], ref s12, ref s13);
-                ArxBoxRound(RCON[7], ref s14, ref s15);
+                    ArxBoxRound(RCON[0], ref s00, ref s01);
+                    ArxBoxRound(RCON[1], ref s02, ref s03);
+                    ArxBoxRound(RCON[2], ref s04, ref s05);
+                    ArxBoxRound(RCON[3], ref s06, ref s07);
+                    ArxBoxRound(RCON[4], ref s08, ref s09);
+                    ArxBoxRound(RCON[5], ref s10, ref s11);
+                    ArxBoxRound(RCON[6], ref s12, ref s13);
+                    ArxBoxRound(RCON[7], ref s14, ref s15);

-                // Linear layer
+                    // Linear layer

-                uint t0246 = ELL(s00 ^ s02 ^ s04 ^ s06);
-                uint t1357 = ELL(s01 ^ s03 ^ s05 ^ s07);
+                    uint t0246 = ELL(s00 ^ s02 ^ s04 ^ s06);
+                    uint t1357 = ELL(s01 ^ s03 ^ s05 ^ s07);

-                uint u08 = s08;
-                uint u09 = s09;
+                    uint u08 = s08;
+                    uint u09 = s09;

-                s08 = s02 ^ s10 ^ t1357;
-                s09 = s03 ^ s11 ^ t0246;
-                s10 = s04 ^ s12 ^ t1357;
-                s11 = s05 ^ s13 ^ t0246;
-                s12 = s06 ^ s14 ^ t1357;
-                s13 = s07 ^ s15 ^ t0246;
-                s14 = s00 ^ u08 ^ t1357;
-                s15 = s01 ^ u09 ^ t0246;
+                    s08 = s02 ^ s10 ^ t1357;
+                    s09 = s03 ^ s11 ^ t0246;
+                    s10 = s04 ^ s12 ^ t1357;
+                    s11 = s05 ^ s13 ^ t0246;
+                    s12 = s06 ^ s14 ^ t1357;
+                    s13 = s07 ^ s15 ^ t0246;
+                    s14 = s00 ^ u08 ^ t1357;
+                    s15 = s01 ^ u09 ^ t0246;

-                // STEP 2
+                    // STEP 2

-                // Add round constant
+                    // Add round constant

-                s09 ^= RCON[step & 7];
-                s11 ^= (uint)(step++);
+                    s09 ^= RCON[step & 7];
+                    s11 ^= (uint)(step++);

-                // ARXBOX layer
+                    // ARXBOX layer

-                ArxBoxRound(RCON[0], ref s08, ref s09);
-                ArxBoxRound(RCON[1], ref s10, ref s11);
-                ArxBoxRound(RCON[2], ref s12, ref s13);
-                ArxBoxRound(RCON[3], ref s14, ref s15);
-                ArxBoxRound(RCON[4], ref s00, ref s01);
-                ArxBoxRound(RCON[5], ref s02, ref s03);
-                ArxBoxRound(RCON[6], ref s04, ref s05);
-                ArxBoxRound(RCON[7], ref s06, ref s07);
+                    ArxBoxRound(RCON[0], ref s08, ref s09);
+                    ArxBoxRound(RCON[1], ref s10, ref s11);
+                    ArxBoxRound(RCON[2], ref s12, ref s13);
+                    ArxBoxRound(RCON[3], ref s14, ref s15);
+                    ArxBoxRound(RCON[4], ref s00, ref s01);
+                    ArxBoxRound(RCON[5], ref s02, ref s03);
+                    ArxBoxRound(RCON[6], ref s04, ref s05);
+                    ArxBoxRound(RCON[7], ref s06, ref s07);

-                // Linear layer
+                    // Linear layer

-                uint t8ACE = ELL(s08 ^ s10 ^ s12 ^ s14);
-                uint t9BDF = ELL(s09 ^ s11 ^ s13 ^ s15);
+                    uint t8ACE = ELL(s08 ^ s10 ^ s12 ^ s14);
+                    uint t9BDF = ELL(s09 ^ s11 ^ s13 ^ s15);

-                uint u00 = s00;
-                uint u01 = s01;
+                    uint u00 = s00;
+                    uint u01 = s01;

-                s00 = s02 ^ s10 ^ t9BDF;
-                s01 = s03 ^ s11 ^ t8ACE;
-                s02 = s04 ^ s12 ^ t9BDF;
-                s03 = s05 ^ s13 ^ t8ACE;
-                s04 = s06 ^ s14 ^ t9BDF;
-                s05 = s07 ^ s15 ^ t8ACE;
-                s06 = u00 ^ s08 ^ t9BDF;
-                s07 = u01 ^ s09 ^ t8ACE;
-            }
+                    s00 = s02 ^ s10 ^ t9BDF;
+                    s01 = s03 ^ s11 ^ t8ACE;
+                    s02 = s04 ^ s12 ^ t9BDF;
+                    s03 = s05 ^ s13 ^ t8ACE;
+                    s04 = s06 ^ s14 ^ t9BDF;
+                    s05 = s07 ^ s15 ^ t8ACE;
+                    s06 = u00 ^ s08 ^ t9BDF;
+                    s07 = u01 ^ s09 ^ t8ACE;
+                }

-            state[ 0] = s00;
-            state[ 1] = s01;
-            state[ 2] = s02;
-            state[ 3] = s03;
-            state[ 4] = s04;
-            state[ 5] = s05;
-            state[ 6] = s06;
-            state[ 7] = s07;
-            state[ 8] = s08;
-            state[ 9] = s09;
-            state[10] = s10;
-            state[11] = s11;
-            state[12] = s12;
-            state[13] = s13;
-            state[14] = s14;
-            state[15] = s15;
+                state[ 0] = s00;
+                state[ 1] = s02;
+                state[ 2] = s04;
+                state[ 3] = s06;
+                state[ 4] = s01;
+                state[ 5] = s03;
+                state[ 6] = s05;
+                state[ 7] = s07;
+                state[ 8] = s08;
+                state[ 9] = s10;
+                state[10] = s12;
+                state[11] = s14;
+                state[12] = s09;
+                state[13] = s11;
+                state[14] = s13;
+                state[15] = s15;
+            }
         }

 #if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
 #endif
@@ -503,5 +589,81 @@ namespace Org.BouncyCastle.Crypto.Digests
         {
             return Integers.RotateRight(x, 16) ^ (x & 0xFFFFU);
         }
+
+#if NETCOREAPP3_0_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void ArxBoxRound(Vector128<uint> rc, ref Vector128<uint> s00, ref Vector128<uint> s01)
+        {
+            s00 = Sse2.Add(s00, Sse2.ShiftRightLogical(s01, 31));
+            s00 = Sse2.Add(s00, Sse2.ShiftLeftLogical(s01, 1));
+
+            s01 = Sse2.Xor(s01, Sse2.ShiftRightLogical(s00, 24));
+            s01 = Sse2.Xor(s01, Sse2.ShiftLeftLogical(s00, 8));
+
+            s00 = Sse2.Xor(s00, rc);
+
+            s00 = Sse2.Add(s00, Sse2.ShiftRightLogical(s01, 17));
+            s00 = Sse2.Add(s00, Sse2.ShiftLeftLogical(s01, 15));
+
+            s01 = Sse2.Xor(s01, Sse2.ShiftRightLogical(s00, 17));
+            s01 = Sse2.Xor(s01, Sse2.ShiftLeftLogical(s00, 15));
+
+            s00 = Sse2.Xor(s00, rc);
+
+            s00 = Sse2.Add(s00, s01);
+
+            s01 = Sse2.Xor(s01, Sse2.ShiftRightLogical(s00, 31));
+            s01 = Sse2.Xor(s01, Sse2.ShiftLeftLogical(s00, 1));
+
+            s00 = Sse2.Xor(s00, rc);
+
+            s00 = Sse2.Add(s00, Sse2.ShiftRightLogical(s01, 24));
+            s00 = Sse2.Add(s00, Sse2.ShiftLeftLogical(s01, 8));
+
+            s01 = Sse2.Xor(s01, Sse2.ShiftRightLogical(s00, 16));
+            s01 = Sse2.Xor(s01, Sse2.ShiftLeftLogical(s00, 16));
+
+            s00 = Sse2.Xor(s00, rc);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<uint> ELL(Vector128<uint> x)
+        {
+            var t = Sse2.ShiftLeftLogical(x, 16);
+            var u = Sse2.Xor(x, t);
+            return Sse2.Xor(t, Sse2.ShiftRightLogical(u, 16));
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<uint> HorizontalXor(Vector128<uint> x)
+        {
+            var t = Sse2.Xor(x, Sse2.Shuffle(x, 0x1B));
+            return Sse2.Xor(t, Sse2.Shuffle(t, 0xB1));
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<uint> Load128(ReadOnlySpan<uint> t)
+        {
+            if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<uint>>() == 16)
+                return MemoryMarshal.Read<Vector128<uint>>(MemoryMarshal.AsBytes(t));
+
+            return Vector128.Create(t[0], t[1], t[2], t[3]);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void Store128(Vector128<uint> s, Span<uint> t)
+        {
+            var b = MemoryMarshal.AsBytes(t);
+            if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<uint>>() == 16)
+            {
+                MemoryMarshal.Write(b, ref s);
+                return;
+            }
+
+            var u = s.AsUInt64();
+            BinaryPrimitives.WriteUInt64LittleEndian(b[..8], u.GetElement(0));
+            BinaryPrimitives.WriteUInt64LittleEndian(b[8..], u.GetElement(1));
+        }
+#endif
     }
 }
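For orientation, and not part of the commit itself: the Sse2 code path above is selected at run time inside `SparkleOpt16` via `Sse2.IsSupported`, so callers keep using the ordinary `IDigest` surface. Below is a minimal, hypothetical usage sketch, assuming the class exposes the nested `SparkleDigest.SparkleParameters` enum constructor; `ESCH384` is taken here as the parameter set backed by the 16-word state that `SparkleOpt16` permutes, and all variable names are illustrative.

```csharp
using System;
using System.Text;

using Org.BouncyCastle.Crypto.Digests;
using Org.BouncyCastle.Utilities.Encoders;

public static class SparkleDigestExample
{
    public static void Main()
    {
        // Esch384 uses the 512-bit (16 x uint) Sparkle state, i.e. the
        // SparkleOpt16 permutation that this commit vectorises for Sse2.
        // (Parameter-set/constructor naming assumed from the bc-csharp API.)
        var digest = new SparkleDigest(SparkleDigest.SparkleParameters.ESCH384);

        byte[] msg = Encoding.ASCII.GetBytes("hello world");
        digest.BlockUpdate(msg, 0, msg.Length);

        // 48-byte output for Esch384; the vectorised path is used
        // automatically on hardware where Sse2.IsSupported is true.
        byte[] hash = new byte[digest.GetDigestSize()];
        digest.DoFinal(hash, 0);

        Console.WriteLine(digest.AlgorithmName + ": " + Hex.ToHexString(hash));
    }
}
```

On builds where NETCOREAPP3_0_OR_GREATER is not defined, the same call simply falls back to the scalar while-loop shown in the diff, since the intrinsics and their helper methods are only compiled in under that symbol.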