From a29bacc171f301d9b2d79561ae105671c134c739 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Wed, 26 Feb 2020 01:22:11 +0100 Subject: [PATCH] Removed usage of BMI2 pdep with SSE2 alternative (#19009) * SSE2 alternative to BMI2 ParallelBitDeposit * Codegen tuning * Keep zero vector in register * Better 64-bit BMI2 alternative * Removed BMI2 --- .../ServerInfrastructure/StringUtilities.cs | 87 ++++++++++--------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/src/Shared/ServerInfrastructure/StringUtilities.cs b/src/Shared/ServerInfrastructure/StringUtilities.cs index 97b2c8b083..ccadb982a0 100644 --- a/src/Shared/ServerInfrastructure/StringUtilities.cs +++ b/src/Shared/ServerInfrastructure/StringUtilities.cs @@ -24,22 +24,25 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure Debug.Assert((long)end >= Vector256<sbyte>.Count); + // PERF: so the JIT can reuse the zero from a register + Vector128<sbyte> zero = Vector128<sbyte>.Zero; + if (Sse2.IsSupported) { if (Avx2.IsSupported && input <= end - Vector256<sbyte>.Count) { - Vector256<sbyte> zero = Vector256<sbyte>.Zero; + Vector256<sbyte> avxZero = Vector256<sbyte>.Zero; do { var vector = Avx.LoadVector256(input).AsSByte(); - if (!CheckBytesInAsciiRange(vector, zero)) + if (!CheckBytesInAsciiRange(vector, avxZero)) { return false; } - var tmp0 = Avx2.UnpackLow(vector, zero); - var tmp1 = Avx2.UnpackHigh(vector, zero); + var tmp0 = Avx2.UnpackLow(vector, avxZero); + var tmp1 = Avx2.UnpackHigh(vector, avxZero); // Bring into the right order var out0 = Avx2.Permute2x128(tmp0, tmp1, 0x20); @@ -60,8 +63,6 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure if (input <= end - Vector128<sbyte>.Count) { - Vector128<sbyte> zero = Vector128<sbyte>.Zero; - do { var vector = Sse2.LoadVector128(input).AsSByte(); @@ -122,11 +123,12 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure return false; } - if (Bmi2.X64.IsSupported) + // BMI2 could be used, but this variant is faster on both Intel and 
AMD. + if (Sse2.X64.IsSupported) { - // BMI2 will work regardless of the processor's endianness. - ((ulong*)output)[0] = Bmi2.X64.ParallelBitDeposit((ulong)value, 0x00FF00FF_00FF00FFul); - ((ulong*)output)[1] = Bmi2.X64.ParallelBitDeposit((ulong)(value >> 32), 0x00FF00FF_00FF00FFul); + Vector128<sbyte> vecNarrow = Sse2.X64.ConvertScalarToVector128Int64(value).AsSByte(); + Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64(); + Sse2.Store((ulong*)output, vecWide); } else { @@ -152,19 +154,7 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure return false; } - if (Bmi2.IsSupported) - { - // BMI2 will work regardless of the processor's endianness. - ((uint*)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu); - ((uint*)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu); - } - else - { - output[0] = (char)input[0]; - output[1] = (char)input[1]; - output[2] = (char)input[2]; - output[3] = (char)input[3]; - } + WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value, zero); input += sizeof(int); output += sizeof(int); @@ -181,19 +171,7 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure return false; } - if (Bmi2.IsSupported) - { - // BMI2 will work regardless of the processor's endianness. 
- ((uint*)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu); - ((uint*)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu); - } - else - { - output[0] = (char)input[0]; - output[1] = (char)input[1]; - output[2] = (char)input[2]; - output[3] = (char)input[3]; - } + WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value, zero); input += sizeof(int); output += sizeof(int); @@ -483,6 +461,25 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure return false; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void WidenFourAsciiBytesToUtf16AndWriteToBuffer(char* output, byte* input, int value, Vector128<sbyte> zero) + { + // BMI2 could be used, but this variant is faster on both Intel and AMD. + if (Sse2.X64.IsSupported) + { + Vector128<sbyte> vecNarrow = Sse2.ConvertScalarToVector128Int32(value).AsSByte(); + Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64(); + Unsafe.WriteUnaligned(output, Sse2.X64.ConvertToUInt64(vecWide)); + } + else + { + output[0] = (char)input[0]; + output[1] = (char)input[1]; + output[2] = (char)input[2]; + output[3] = (char)input[3]; + } + } + /// <summary> /// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and /// compares them to the WORD buffer with machine endianness. @@ -495,11 +492,13 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure return false; } - if (Bmi2.X64.IsSupported) + // BMI2 could be used, but this variant is faster on both Intel and AMD. + if (Sse2.X64.IsSupported) { - // BMI2 will work regardless of the processor's endianness. 
+ Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte(); + Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64(); return Unsafe.ReadUnaligned<ulong>(ref Unsafe.As<char, byte>(ref charStart)) == - Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul); + Sse2.X64.ConvertToUInt64(vecWide); } else { @@ -532,11 +531,13 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure return false; } - if (Bmi2.IsSupported) + // BMI2 could be used, but this variant is faster on both Intel and AMD. + if (Sse2.IsSupported) { - // BMI2 will work regardless of the processor's endianness. + Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte(); + Vector128<uint> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt32(); return Unsafe.ReadUnaligned<uint>(ref Unsafe.As<char, byte>(ref charStart)) == - Bmi2.ParallelBitDeposit(value, 0x00FF00FFu); + Sse2.ConvertToUInt32(vecWide); } else { @@ -665,12 +666,14 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure return (((check - 0x0101010101010101L) | check) & HighBits) == 0; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool CheckBytesInAsciiRange(int check) { const int HighBits = unchecked((int)0x80808080); return (((check - 0x01010101) | check) & HighBits) == 0; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool CheckBytesInAsciiRange(short check) { const short HighBits = unchecked((short)0x8080);