#13 - Detailed UTF-8 validation.
This commit is contained in:
parent
ae77def939
commit
98e9285fa8
|
|
@ -70,49 +70,52 @@ namespace Microsoft.AspNet.WebSockets.Protocol
|
|||
}
|
||||
}
|
||||
|
||||
// For now this is stateless and does not handle sequences spliced across messages.
|
||||
// http://etutorials.org/Programming/secure+programming/Chapter+3.+Input+Validation/3.12+Detecting+Illegal+UTF-8+Characters/
|
||||
// Performs a stateful validation of UTF-8 bytes.
|
||||
// It checks for valid formatting, overlong encodings, surrogates, and value ranges.
|
||||
public static bool TryValidateUtf8(ArraySegment<byte> arraySegment, bool endOfMessage, Utf8MessageState state)
|
||||
{
|
||||
for (int i = arraySegment.Offset; i < arraySegment.Offset + arraySegment.Count; )
|
||||
{
|
||||
// Have we started a character sequence yet?
|
||||
if (!state.SequenceInProgress)
|
||||
{
|
||||
// The first byte tells us how many bytes are in the sequence.
|
||||
state.SequenceInProgress = true;
|
||||
byte b = arraySegment.Array[i];
|
||||
i++;
|
||||
if ((b & 0x80) == 0) // 0bbbbbbb, single byte
|
||||
{
|
||||
state.AdditionalBytesExpected = 0;
|
||||
state.CurrentDecodeBits = b & 0x7F;
|
||||
state.ExpectedValueMin = 0;
|
||||
}
|
||||
else if ((b & 0xC0) == 0x80)
|
||||
{
|
||||
return false; // Misplaced 10bbbbbb byte. This cannot be the first byte.
|
||||
// Misplaced 10bbbbbb continuation byte. This cannot be the first byte.
|
||||
return false;
|
||||
}
|
||||
else if ((b & 0xE0) == 0xC0) // 110bbbbb 10bbbbbb
|
||||
{
|
||||
state.AdditionalBytesExpected = 1;
|
||||
state.CurrentDecodeBits = b & 0x1F;
|
||||
state.ExpectedValueMin = 0x80;
|
||||
}
|
||||
else if ((b & 0xF0) == 0xE0) // 1110bbbb 10bbbbbb 10bbbbbb
|
||||
{
|
||||
state.AdditionalBytesExpected = 2;
|
||||
state.CurrentDecodeBits = b & 0xF;
|
||||
state.ExpectedValueMin = 0x800;
|
||||
}
|
||||
else if ((b & 0xF8) == 0xF0) // 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
|
||||
{
|
||||
state.AdditionalBytesExpected = 3;
|
||||
state.CurrentDecodeBits = b & 0x7;
|
||||
state.ExpectedValueMin = 0x10000;
|
||||
}
|
||||
else if ((b & 0xFC) == 0xF8) // 111110bb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb
|
||||
{
|
||||
state.AdditionalBytesExpected = 4;
|
||||
}
|
||||
else if ((b & 0xFE) == 0xFC) // 1111110b 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb
|
||||
{
|
||||
state.AdditionalBytesExpected = 5;
|
||||
}
|
||||
else // 11111110 && 11111111 are not valid
|
||||
else // 111110bb, 1111110b, 11111110, and 11111111 are not valid
|
||||
{
|
||||
return false;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
while (state.AdditionalBytesExpected > 0 && i < arraySegment.Offset + arraySegment.Count)
|
||||
{
|
||||
|
|
@ -121,12 +124,32 @@ namespace Microsoft.AspNet.WebSockets.Protocol
|
|||
{
|
||||
return false;
|
||||
}
|
||||
state.AdditionalBytesExpected--;
|
||||
|
||||
i++;
|
||||
state.AdditionalBytesExpected--;
|
||||
|
||||
// Each continuation byte carries 6 bits of data (binary 10bbbbbb).
|
||||
state.CurrentDecodeBits = (state.CurrentDecodeBits << 6) | b & 0x3F;
|
||||
|
||||
if (state.AdditionalBytesExpected == 1 && state.CurrentDecodeBits >= 0x360 && state.CurrentDecodeBits <= 0x37F)
|
||||
{
|
||||
// This would decode into the range 0xD800-0xDFFF, the UTF-16 surrogates, which are not allowed in UTF-8.
|
||||
return false;
|
||||
}
|
||||
if (state.AdditionalBytesExpected == 2 && state.CurrentDecodeBits >= 0x110)
|
||||
{
|
||||
// This is going to be out of the upper Unicode bound 0x10FFFF.
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (state.AdditionalBytesExpected == 0)
|
||||
{
|
||||
state.SequenceInProgress = false;
|
||||
if (state.CurrentDecodeBits < state.ExpectedValueMin)
|
||||
{
|
||||
// Overlong encoding (e.g. using 2 bytes to encode something that only needed 1).
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (endOfMessage && state.SequenceInProgress)
|
||||
|
|
@ -140,6 +163,8 @@ namespace Microsoft.AspNet.WebSockets.Protocol
|
|||
{
|
||||
public bool SequenceInProgress { get; set; }
|
||||
public int AdditionalBytesExpected { get; set; }
|
||||
public int ExpectedValueMin { get; set; }
|
||||
public int CurrentDecodeBits { get; set; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,6 +13,8 @@ namespace Microsoft.AspNet.WebSockets.Protocol.Test
|
|||
[InlineData(new byte[] { })]
|
||||
[InlineData(new byte[] { 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x57, 0x6F, 0x72, 0x6C, 0x64 })] // Hello World
|
||||
[InlineData(new byte[] { 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x2D, 0xC2, 0xB5, 0x40, 0xC3, 0x9F, 0xC3, 0xB6, 0xC3, 0xA4, 0xC3, 0xBC, 0xC3, 0xA0, 0xC3, 0xA1 })] // "Hello-µ@ßöäüàá";
|
||||
// [InlineData(new byte[] { 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0xf0, 0xa4, 0xad, 0xa2, 0x77, 0x6f, 0x72, 0x6c, 0x64 })] // "hello\U00024b62world"
|
||||
[InlineData(new byte[] { 0xf0, 0xa4, 0xad, 0xa2 })] // "\U00024b62"
|
||||
public void ValidateSingleValidSegments_Valid(byte[] data)
|
||||
{
|
||||
var state = new Utilities.Utf8MessageState();
|
||||
|
|
@ -35,28 +37,39 @@ namespace Microsoft.AspNet.WebSockets.Protocol.Test
|
|||
[InlineData(new byte[] { 0xfe })]
|
||||
[InlineData(new byte[] { 0xff })]
|
||||
[InlineData(new byte[] { 0xfe, 0xfe, 0xff, 0xff })]
|
||||
// [InlineData(new byte[] { 0xc0, 0xaf })]
|
||||
// [InlineData(new byte[] { 0xe0, 0x80, 0xaf })]
|
||||
// [InlineData(new byte[] { 0xf4, 0x90, 0x80, 0x80 })]
|
||||
// [InlineData(new byte[] { 0xf0, 0x80, 0x80, 0xaf })]
|
||||
// [InlineData(new byte[] { 0xf8, 0x80, 0x80, 0x80, 0xaf })]
|
||||
// [InlineData(new byte[] { 0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf })]
|
||||
// [InlineData(new byte[] { 0xc1, 0xbf })]
|
||||
// [InlineData(new byte[] { 0xed, 0xa0, 0x80, 0x65, 0x64, 0x69, 0x74, 0x65, 0x64 })] // 0xEDA080 decodes to 0xD800, which is a reserved high surrogate character.
|
||||
[InlineData(new byte[] { 0xc0, 0xb1 })] // Overlong Ascii
|
||||
[InlineData(new byte[] { 0xc1, 0xb1 })] // Overlong Ascii
|
||||
[InlineData(new byte[] { 0xe0, 0x80, 0xaf })] // Overlong
|
||||
[InlineData(new byte[] { 0xf0, 0x80, 0x80, 0xaf })] // Overlong
|
||||
[InlineData(new byte[] { 0xf8, 0x80, 0x80, 0x80, 0xaf })] // Overlong
|
||||
[InlineData(new byte[] { 0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf })] // Overlong
|
||||
[InlineData(new byte[] { 0xed, 0xa0, 0x80, 0x65, 0x64, 0x69, 0x74, 0x65, 0x64 })] // 0xEDA080 decodes to 0xD800, which is a reserved high surrogate character.
|
||||
// Checks that each supplied byte sequence is rejected when it is validated
// as one complete message: the whole array is passed as a single segment
// with endOfMessage set to true.
public void ValidateSingleInvalidSegment_Invalid(byte[] data)
|
||||
{
|
||||
// Start from a fresh state so nothing carries over from another case.
var state = new Utilities.Utf8MessageState();
|
||||
// The validator must return false for this input.
Assert.False(Utilities.TryValidateUtf8(new ArraySegment<byte>(data), endOfMessage: true, state: state));
|
||||
}
|
||||
/*
|
||||
[Theory]
|
||||
// [InlineData(true, new byte[] { 0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5, 0xf4 }, false, new byte[] { 0x90 }, true, new byte[] { })]
|
||||
public void ValidateMultipleInvalidSegments_Invalid(bool valid1, byte[] data1, bool valid2, byte[] data2, bool valid3, byte[] data3)
|
||||
|
||||
[Fact]
|
||||
public void ValidateIndividualInvalidSegments_Invalid()
|
||||
{
|
||||
var data = new byte[] { 0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5, 0xed, 0xa0, 0x80, 0x65, 0x64, 0x69, 0x74, 0x65, 0x64 };
|
||||
var state = new Utilities.Utf8MessageState();
|
||||
Assert.True(valid1 == Utilities.TryValidateUtf8(new ArraySegment<byte>(data1), endOfMessage: false, state: state), "1st");
|
||||
Assert.True(valid2 == Utilities.TryValidateUtf8(new ArraySegment<byte>(data2), endOfMessage: false, state: state), "2nd");
|
||||
Assert.True(valid3 == Utilities.TryValidateUtf8(new ArraySegment<byte>(data3), endOfMessage: true, state: state), "3rd");
|
||||
}*/
|
||||
for (int i = 0; i < 12; i++)
|
||||
{
|
||||
Assert.True(Utilities.TryValidateUtf8(new ArraySegment<byte>(data, i, 1), endOfMessage: false, state: state), i.ToString());
|
||||
}
|
||||
Assert.False(Utilities.TryValidateUtf8(new ArraySegment<byte>(data, 12, 1), endOfMessage: false, state: state), 12.ToString());
|
||||
}
|
||||
|
||||
// Checks that an invalid sequence split across two segments is detected
// using the state carried from the first call into the second.
[Fact]
|
||||
public void ValidateMultipleInvalidSegments_Invalid()
|
||||
{
|
||||
// Complete two- and three-byte sequences followed by 0xf4, the lead byte
// of a four-byte sequence, so this segment ends mid-character.
var data0 = new byte[] { 0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5, 0xf4 };
|
||||
// Continuation byte 0x90 after 0xf4 accumulates to 0x110, which the
// validator rejects as exceeding the upper Unicode bound 0x10FFFF.
var data1 = new byte[] { 0x90 };
|
||||
// The same state object is shared by both calls.
var state = new Utilities.Utf8MessageState();
|
||||
// The first segment is valid so far; the unfinished sequence is allowed
// because endOfMessage is false.
Assert.True(Utilities.TryValidateUtf8(new ArraySegment<byte>(data0), endOfMessage: false, state: state));
|
||||
// The second segment makes the in-progress sequence invalid.
Assert.False(Utilities.TryValidateUtf8(new ArraySegment<byte>(data1), endOfMessage: false, state: state));
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue