From 677feb1f2d6f7bf941a7423ae3ebd9c9d66a90ca Mon Sep 17 00:00:00 2001 From: Peter Dettman Date: Sat, 22 Oct 2022 00:49:49 +0700 Subject: Salsa20 perf. opts. --- crypto/src/crypto/engines/Salsa20Engine.cs | 153 ++++++++++++++++++++++------- 1 file changed, 117 insertions(+), 36 deletions(-) diff --git a/crypto/src/crypto/engines/Salsa20Engine.cs b/crypto/src/crypto/engines/Salsa20Engine.cs index c3e44f645..fa5477ed5 100644 --- a/crypto/src/crypto/engines/Salsa20Engine.cs +++ b/crypto/src/crypto/engines/Salsa20Engine.cs @@ -1,5 +1,12 @@ using System; -using System.Text; +#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER +using System.Runtime.CompilerServices; +#endif +#if NETCOREAPP3_0_OR_GREATER +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif using Org.BouncyCastle.Crypto.Parameters; using Org.BouncyCastle.Crypto.Utilities; @@ -254,7 +261,71 @@ namespace Org.BouncyCastle.Crypto.Engines if (rounds % 2 != 0) throw new ArgumentException("Number of rounds must be even"); - uint x00 = input[ 0]; +#if NETCOREAPP3_0_OR_GREATER + if (Sse41.IsSupported && BitConverter.IsLittleEndian && Unsafe.SizeOf>() == 16) + { + Vector128 b0, b1, b2, b3; + { + var I = MemoryMarshal.Cast(input.AsSpan(0, 16)); + var t0 = MemoryMarshal.Read>(I[0x00..0x10]); + var t1 = MemoryMarshal.Read>(I[0x10..0x20]); + var t2 = MemoryMarshal.Read>(I[0x20..0x30]); + var t3 = MemoryMarshal.Read>(I[0x30..0x40]); + + var u0 = Sse41.Blend(t0, t2, 0xF0); + var u1 = Sse41.Blend(t1, t3, 0xC3); + var u2 = Sse41.Blend(t0, t2, 0x0F); + var u3 = Sse41.Blend(t1, t3, 0x3C); + + b0 = Sse41.Blend(u0, u1, 0xCC).AsUInt32(); + b1 = Sse41.Blend(u0, u1, 0x33).AsUInt32(); + b2 = Sse41.Blend(u2, u3, 0xCC).AsUInt32(); + b3 = Sse41.Blend(u2, u3, 0x33).AsUInt32(); + } + + var c0 = b0; + var c1 = b1; + var c2 = b2; + var c3 = b3; + + for (int i = rounds; i > 0; i -= 2) + { + QuarterRound_Sse2(ref c0, ref c3, ref c2, ref c1); + QuarterRound_Sse2(ref c0, ref c1, ref c2, ref c3); + } + + b0 = Sse2.Add(b0, c0); + b1 = Sse2.Add(b1, c1); + b2 = Sse2.Add(b2, c2); + b3 = Sse2.Add(b3, c3); + + { + var t0 = b0.AsUInt16(); + var t1 = b1.AsUInt16(); + var t2 = b2.AsUInt16(); + var t3 = b3.AsUInt16(); + + var u0 = Sse41.Blend(t0, t1, 0xCC); + var u1 = Sse41.Blend(t0, t1, 0x33); + var u2 = Sse41.Blend(t2, t3, 0xCC); + var u3 = Sse41.Blend(t2, t3, 0x33); + + var v0 = Sse41.Blend(u0, u2, 0xF0); + var v1 = Sse41.Blend(u1, u3, 0xC3); + var v2 = Sse41.Blend(u0, u2, 0x0F); + var v3 = Sse41.Blend(u1, u3, 0x3C); + + var X = MemoryMarshal.Cast(x.AsSpan(0, 16)); + MemoryMarshal.Write(X[0x00..0x10], ref v0); + MemoryMarshal.Write(X[0x10..0x20], ref v1); + MemoryMarshal.Write(X[0x20..0x30], ref v2); + MemoryMarshal.Write(X[0x30..0x40], ref v3); + } + return; + } +#endif + + uint x00 = input[ 0]; uint x01 = input[ 1]; uint x02 = input[ 2]; uint x03 = input[ 3]; @@ -273,39 +344,15 @@ namespace Org.BouncyCastle.Crypto.Engines for (int i = rounds; i > 0; i -= 2) { - x04 ^= Integers.RotateLeft((x00+x12), 7); - x08 ^= Integers.RotateLeft((x04+x00), 9); - x12 ^= Integers.RotateLeft((x08+x04),13); - x00 ^= Integers.RotateLeft((x12+x08),18); - x09 ^= Integers.RotateLeft((x05+x01), 7); - x13 ^= Integers.RotateLeft((x09+x05), 9); - x01 ^= Integers.RotateLeft((x13+x09),13); - x05 ^= Integers.RotateLeft((x01+x13),18); - x14 ^= Integers.RotateLeft((x10+x06), 7); - x02 ^= Integers.RotateLeft((x14+x10), 9); - x06 ^= Integers.RotateLeft((x02+x14),13); - x10 ^= Integers.RotateLeft((x06+x02),18); - x03 ^= Integers.RotateLeft((x15+x11), 7); - x07 ^= Integers.RotateLeft((x03+x15), 9); - x11 ^= Integers.RotateLeft((x07+x03),13); - x15 ^= Integers.RotateLeft((x11+x07),18); - - x01 ^= Integers.RotateLeft((x00+x03), 7); - x02 ^= Integers.RotateLeft((x01+x00), 9); - x03 ^= Integers.RotateLeft((x02+x01),13); - x00 ^= Integers.RotateLeft((x03+x02),18); - x06 ^= Integers.RotateLeft((x05+x04), 7); - x07 ^= Integers.RotateLeft((x06+x05), 9); - x04 ^= Integers.RotateLeft((x07+x06),13); - x05 ^= Integers.RotateLeft((x04+x07),18); - x11 ^= Integers.RotateLeft((x10+x09), 7); - x08 ^= Integers.RotateLeft((x11+x10), 9); - x09 ^= Integers.RotateLeft((x08+x11),13); - x10 ^= Integers.RotateLeft((x09+x08),18); - x12 ^= Integers.RotateLeft((x15+x14), 7); - x13 ^= Integers.RotateLeft((x12+x15), 9); - x14 ^= Integers.RotateLeft((x13+x12),13); - x15 ^= Integers.RotateLeft((x14+x13),18); + QuarterRound(ref x00, ref x04, ref x08, ref x12); + QuarterRound(ref x05, ref x09, ref x13, ref x01); + QuarterRound(ref x10, ref x14, ref x02, ref x06); + QuarterRound(ref x15, ref x03, ref x07, ref x11); + + QuarterRound(ref x00, ref x01, ref x02, ref x03); + QuarterRound(ref x05, ref x06, ref x07, ref x04); + QuarterRound(ref x10, ref x11, ref x08, ref x09); + QuarterRound(ref x15, ref x12, ref x13, ref x14); } x[ 0] = x00 + input[ 0]; @@ -364,5 +411,39 @@ namespace Org.BouncyCastle.Crypto.Engines return false; } - } + +#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER + [MethodImpl(MethodImplOptions.AggressiveInlining)] +#endif + private static void QuarterRound(ref uint a, ref uint b, ref uint c, ref uint d) + { + b ^= Integers.RotateLeft(a + d, 7); + c ^= Integers.RotateLeft(b + a, 9); + d ^= Integers.RotateLeft(c + b, 13); + a ^= Integers.RotateLeft(d + c, 18); + } + +#if NETCOREAPP3_0_OR_GREATER + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void QuarterRound_Sse2(ref Vector128 a, ref Vector128 b, ref Vector128 c, + ref Vector128 d) + { + b = Sse2.Xor(b, Rotate_Sse2(Sse2.Add(a, d), 7)); + c = Sse2.Xor(c, Rotate_Sse2(Sse2.Add(b, a), 9)); + d = Sse2.Xor(d, Rotate_Sse2(Sse2.Add(c, b), 13)); + a = Sse2.Xor(a, Rotate_Sse2(Sse2.Add(d, c), 18)); + + b = Sse2.Shuffle(b, 0x93); + c = Sse2.Shuffle(c, 0x4E); + d = Sse2.Shuffle(d, 0x39); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 Rotate_Sse2(Vector128 x, byte sl) + { + byte sr = (byte)(32 - sl); + return Sse2.Xor(Sse2.ShiftLeftLogical(x, sl), Sse2.ShiftRightLogical(x, sr)); + } +#endif + } } -- cgit 1.4.1