Removed usage of BMI2 pdep with SSE2 alternative (#19009)

* SSE2 alternative to BMI2 ParallelBitDeposit

* Codegen tuning

* Keep zero vector in register

* Better 64-bit BMI2 alternative

* Removed BMI2
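For context, the pdep-based widening being removed and the unpack-with-zero widening that replaces it produce the same eight output bytes on little-endian x86. The short standalone sketch below is not part of the diff (the class and variable names are illustrative); it only demonstrates the equivalence of the two approaches using the public .NET hardware-intrinsics APIs, assuming an x86 machine where SSE2 and BMI2 are available.

```csharp
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

class WidenDemo
{
    static void Main()
    {
        // Four ASCII bytes packed into one little-endian DWORD ("Test").
        int value = BitConverter.ToInt32(new byte[] { (byte)'T', (byte)'e', (byte)'s', (byte)'t' }, 0);

        if (Sse2.IsSupported)
        {
            // SSE2 path: place the DWORD in the low lane of a vector and interleave
            // it with zero bytes, which widens each byte to a 16-bit char slot.
            Vector128<byte> narrow = Sse2.ConvertScalarToVector128UInt32((uint)value).AsByte();
            Vector128<ulong> wide = Sse2.UnpackLow(narrow, Vector128<byte>.Zero).AsUInt64();
            ulong sse2Result = wide.GetElement(0);

            if (Bmi2.IsSupported)
            {
                // BMI2 path being replaced: pdep scatters the four source bytes into
                // the low byte of each 16-bit slot, half a QWORD at a time.
                ulong low = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu);
                ulong high = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu);
                ulong bmi2Result = low | (high << 32);
                Console.WriteLine(sse2Result == bmi2Result); // True on little-endian x86
            }

            Console.WriteLine($"0x{sse2Result:X16}"); // 0x0074007300650054 ("Test" widened)
        }
    }
}
```

The SSE2 sequence (movd + punpcklbw) only requires baseline SSE2, and pdep is known to be slow (microcoded) on AMD processors before Zen 3, which is consistent with the commit's note that the SSE2 variant is faster on both Intel and AMD.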
Günther Foidl 2020-02-26 01:22:11 +01:00 committed by GitHub
parent d5cf36acc7
commit a29bacc171
1 changed file with 45 additions and 42 deletions


@@ -24,22 +24,25 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure
 Debug.Assert((long)end >= Vector256<sbyte>.Count);
+// PERF: so the JIT can reuse the zero from a register
+Vector128<sbyte> zero = Vector128<sbyte>.Zero;
 if (Sse2.IsSupported)
 {
     if (Avx2.IsSupported && input <= end - Vector256<sbyte>.Count)
     {
-        Vector256<sbyte> zero = Vector256<sbyte>.Zero;
+        Vector256<sbyte> avxZero = Vector256<sbyte>.Zero;
         do
         {
             var vector = Avx.LoadVector256(input).AsSByte();
-            if (!CheckBytesInAsciiRange(vector, zero))
+            if (!CheckBytesInAsciiRange(vector, avxZero))
             {
                 return false;
             }
-            var tmp0 = Avx2.UnpackLow(vector, zero);
-            var tmp1 = Avx2.UnpackHigh(vector, zero);
+            var tmp0 = Avx2.UnpackLow(vector, avxZero);
+            var tmp1 = Avx2.UnpackHigh(vector, avxZero);
             // Bring into the right order
             var out0 = Avx2.Permute2x128(tmp0, tmp1, 0x20);
@@ -60,8 +63,6 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure
 if (input <= end - Vector128<sbyte>.Count)
 {
-    Vector128<sbyte> zero = Vector128<sbyte>.Zero;
     do
     {
         var vector = Sse2.LoadVector128(input).AsSByte();
@@ -122,11 +123,12 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure
     return false;
 }
-if (Bmi2.X64.IsSupported)
+// BMI2 could be used, but this variant is faster on both Intel and AMD.
+if (Sse2.X64.IsSupported)
 {
-    // BMI2 will work regardless of the processor's endianness.
-    ((ulong*)output)[0] = Bmi2.X64.ParallelBitDeposit((ulong)value, 0x00FF00FF_00FF00FFul);
-    ((ulong*)output)[1] = Bmi2.X64.ParallelBitDeposit((ulong)(value >> 32), 0x00FF00FF_00FF00FFul);
+    Vector128<sbyte> vecNarrow = Sse2.X64.ConvertScalarToVector128Int64(value).AsSByte();
+    Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
+    Sse2.Store((ulong*)output, vecWide);
 }
 else
 {
@@ -152,19 +154,7 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure
     return false;
 }
-if (Bmi2.IsSupported)
-{
-    // BMI2 will work regardless of the processor's endianness.
-    ((uint*)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu);
-    ((uint*)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu);
-}
-else
-{
-    output[0] = (char)input[0];
-    output[1] = (char)input[1];
-    output[2] = (char)input[2];
-    output[3] = (char)input[3];
-}
+WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value, zero);
 input += sizeof(int);
 output += sizeof(int);
@@ -181,19 +171,7 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure
     return false;
 }
-if (Bmi2.IsSupported)
-{
-    // BMI2 will work regardless of the processor's endianness.
-    ((uint*)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu);
-    ((uint*)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu);
-}
-else
-{
-    output[0] = (char)input[0];
-    output[1] = (char)input[1];
-    output[2] = (char)input[2];
-    output[3] = (char)input[3];
-}
+WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value, zero);
 input += sizeof(int);
 output += sizeof(int);
@@ -483,6 +461,25 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure
     return false;
 }
+[MethodImpl(MethodImplOptions.AggressiveInlining)]
+private static unsafe void WidenFourAsciiBytesToUtf16AndWriteToBuffer(char* output, byte* input, int value, Vector128<sbyte> zero)
+{
+    // BMI2 could be used, but this variant is faster on both Intel and AMD.
+    if (Sse2.X64.IsSupported)
+    {
+        Vector128<sbyte> vecNarrow = Sse2.ConvertScalarToVector128Int32(value).AsSByte();
+        Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
+        Unsafe.WriteUnaligned(output, Sse2.X64.ConvertToUInt64(vecWide));
+    }
+    else
+    {
+        output[0] = (char)input[0];
+        output[1] = (char)input[1];
+        output[2] = (char)input[2];
+        output[3] = (char)input[3];
+    }
+}
 /// <summary>
 /// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
 /// compares them to the WORD buffer with machine endianness.
@@ -495,11 +492,13 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure
     return false;
 }
-if (Bmi2.X64.IsSupported)
+// BMI2 could be used, but this variant is faster on both Intel and AMD.
+if (Sse2.X64.IsSupported)
 {
-    // BMI2 will work regardless of the processor's endianness.
+    Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
+    Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
     return Unsafe.ReadUnaligned<ulong>(ref Unsafe.As<char, byte>(ref charStart)) ==
-        Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul);
+        Sse2.X64.ConvertToUInt64(vecWide);
 }
 else
 {
@@ -532,11 +531,13 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure
     return false;
 }
-if (Bmi2.IsSupported)
+// BMI2 could be used, but this variant is faster on both Intel and AMD.
+if (Sse2.IsSupported)
 {
-    // BMI2 will work regardless of the processor's endianness.
+    Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
+    Vector128<uint> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt32();
     return Unsafe.ReadUnaligned<uint>(ref Unsafe.As<char, byte>(ref charStart)) ==
-        Bmi2.ParallelBitDeposit(value, 0x00FF00FFu);
+        Sse2.ConvertToUInt32(vecWide);
 }
 else
 {
@@ -665,12 +666,14 @@ namespace Microsoft.AspNetCore.Server.Kestrel.Core.Internal.Infrastructure
     return (((check - 0x0101010101010101L) | check) & HighBits) == 0;
 }
+[MethodImpl(MethodImplOptions.AggressiveInlining)]
 private static bool CheckBytesInAsciiRange(int check)
 {
     const int HighBits = unchecked((int)0x80808080);
     return (((check - 0x01010101) | check) & HighBits) == 0;
 }
+[MethodImpl(MethodImplOptions.AggressiveInlining)]
 private static bool CheckBytesInAsciiRange(short check)
 {
     const short HighBits = unchecked((short)0x8080);