#13 - Detailed UTF-8 validation.

This commit is contained in:
Chris Ross 2014-10-23 17:34:26 -07:00
parent ae77def939
commit 98e9285fa8
2 changed files with 68 additions and 30 deletions

View File

@ -70,49 +70,52 @@ namespace Microsoft.AspNet.WebSockets.Protocol
}
}
// For now this is stateless and does not handle sequences spliced across messages.
// http://etutorials.org/Programming/secure+programming/Chapter+3.+Input+Validation/3.12+Detecting+Illegal+UTF-8+Characters/
// Performs a stateful validation of UTF-8 bytes.
// It checks for valid formatting, overlong encodings, surrogates, and value ranges.
public static bool TryValidateUtf8(ArraySegment<byte> arraySegment, bool endOfMessage, Utf8MessageState state)
{
for (int i = arraySegment.Offset; i < arraySegment.Offset + arraySegment.Count; )
{
// Have we started a character sequence yet?
if (!state.SequenceInProgress)
{
// The first byte tells us how many bytes are in the sequence.
state.SequenceInProgress = true;
byte b = arraySegment.Array[i];
i++;
if ((b & 0x80) == 0) // 0bbbbbbb, single byte
{
state.AdditionalBytesExpected = 0;
state.CurrentDecodeBits = b & 0x7F;
state.ExpectedValueMin = 0;
}
else if ((b & 0xC0) == 0x80)
{
return false; // Misplaced 10bbbbbb byte. This cannot be the first byte.
// Misplaced 10bbbbbb continuation byte. This cannot be the first byte.
return false;
}
else if ((b & 0xE0) == 0xC0) // 110bbbbb 10bbbbbb
{
state.AdditionalBytesExpected = 1;
state.CurrentDecodeBits = b & 0x1F;
state.ExpectedValueMin = 0x80;
}
else if ((b & 0xF0) == 0xE0) // 1110bbbb 10bbbbbb 10bbbbbb
{
state.AdditionalBytesExpected = 2;
state.CurrentDecodeBits = b & 0xF;
state.ExpectedValueMin = 0x800;
}
else if ((b & 0xF8) == 0xF0) // 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
{
state.AdditionalBytesExpected = 3;
state.CurrentDecodeBits = b & 0x7;
state.ExpectedValueMin = 0x10000;
}
else if ((b & 0xFC) == 0xF8) // 111110bb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb
{
state.AdditionalBytesExpected = 4;
}
else if ((b & 0xFE) == 0xFC) // 1111110b 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb
{
state.AdditionalBytesExpected = 5;
}
else // 11111110 && 11111111 are not valid
else // 111110bb & 1111110b & 11111110 && 11111111 are not valid
{
return false;
}
i++;
}
while (state.AdditionalBytesExpected > 0 && i < arraySegment.Offset + arraySegment.Count)
{
@ -121,12 +124,32 @@ namespace Microsoft.AspNet.WebSockets.Protocol
{
return false;
}
state.AdditionalBytesExpected--;
i++;
state.AdditionalBytesExpected--;
// Each continuation byte carries 6 bits of data: 10bbbbbb (binary).
state.CurrentDecodeBits = (state.CurrentDecodeBits << 6) | b & 0x3F;
if (state.AdditionalBytesExpected == 1 && state.CurrentDecodeBits >= 0x360 && state.CurrentDecodeBits <= 0x37F)
{
// This would decode into the 0xD800-0xDFFF range of UTF-16 surrogates, which are not allowed in UTF-8.
return false;
}
if (state.AdditionalBytesExpected == 2 && state.CurrentDecodeBits >= 0x110)
{
// This is going to be out of the upper Unicode bound 0x10FFFF.
return false;
}
}
if (state.AdditionalBytesExpected == 0)
{
state.SequenceInProgress = false;
if (state.CurrentDecodeBits < state.ExpectedValueMin)
{
// Overlong encoding (e.g. using 2 bytes to encode something that only needed 1).
return false;
}
}
}
if (endOfMessage && state.SequenceInProgress)
@ -140,6 +163,8 @@ namespace Microsoft.AspNet.WebSockets.Protocol
{
// True while a character sequence has been started but not yet completed.
// TryValidateUtf8 sets it when it reads the first byte of a sequence and clears it
// once no more continuation bytes are expected.
public bool SequenceInProgress { get; set; }
// Number of continuation bytes (10bbbbbb) still required to finish the current sequence.
// Set from the lead byte and decremented for each continuation byte consumed.
public int AdditionalBytesExpected { get; set; }
// Smallest decoded value allowed for the current sequence length
// (0, 0x80, 0x800 or 0x10000); a smaller decoded value is rejected as an overlong encoding.
public int ExpectedValueMin { get; set; }
// Value bits accumulated so far for the current sequence: the payload bits of the lead byte,
// shifted left by 6 and combined with the low 6 bits of each continuation byte.
public int CurrentDecodeBits { get; set; }
}
}
}

View File

@ -13,6 +13,8 @@ namespace Microsoft.AspNet.WebSockets.Protocol.Test
[InlineData(new byte[] { })]
[InlineData(new byte[] { 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x57, 0x6F, 0x72, 0x6C, 0x64 })] // Hello World
[InlineData(new byte[] { 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x2D, 0xC2, 0xB5, 0x40, 0xC3, 0x9F, 0xC3, 0xB6, 0xC3, 0xA4, 0xC3, 0xBC, 0xC3, 0xA0, 0xC3, 0xA1 })] // "Hello-µ@ßöäüàá";
// [InlineData(new byte[] { 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0xf0, 0xa4, 0xad, 0xa2, 0x77, 0x6f, 0x72, 0x6c, 0x64 })] // "hello\U00024b62world"
[InlineData(new byte[] { 0xf0, 0xa4, 0xad, 0xa2 })] // "\U00024b62"
public void ValidateSingleValidSegments_Valid(byte[] data)
{
var state = new Utilities.Utf8MessageState();
@ -35,28 +37,39 @@ namespace Microsoft.AspNet.WebSockets.Protocol.Test
[InlineData(new byte[] { 0xfe })]
[InlineData(new byte[] { 0xff })]
[InlineData(new byte[] { 0xfe, 0xfe, 0xff, 0xff })]
// [InlineData(new byte[] { 0xc0, 0xaf })]
// [InlineData(new byte[] { 0xe0, 0x80, 0xaf })]
// [InlineData(new byte[] { 0xf4, 0x90, 0x80, 0x80 })]
// [InlineData(new byte[] { 0xf0, 0x80, 0x80, 0xaf })]
// [InlineData(new byte[] { 0xf8, 0x80, 0x80, 0x80, 0xaf })]
// [InlineData(new byte[] { 0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf })]
// [InlineData(new byte[] { 0xc1, 0xbf })]
// [InlineData(new byte[] { 0xed, 0xa0, 0x80, 0x65, 0x64, 0x69, 0x74, 0x65, 0x64 })] // 0xEDA080 decodes to 0xD800, which is a reserved high surrogate character.
[InlineData(new byte[] { 0xc0, 0xb1 })] // Overlong Ascii
[InlineData(new byte[] { 0xc1, 0xb1 })] // Overlong Ascii
[InlineData(new byte[] { 0xe0, 0x80, 0xaf })] // Overlong
[InlineData(new byte[] { 0xf0, 0x80, 0x80, 0xaf })] // Overlong
[InlineData(new byte[] { 0xf8, 0x80, 0x80, 0x80, 0xaf })] // Overlong
[InlineData(new byte[] { 0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf })] // Overlong
[InlineData(new byte[] { 0xed, 0xa0, 0x80, 0x65, 0x64, 0x69, 0x74, 0x65, 0x64 })] // 0xEDA080 decodes to 0xD800, which is a reserved high surrogate character.
public void ValidateSingleInvalidSegment_Invalid(byte[] data)
{
    // Each case is validated as one complete message with a fresh validation state.
    var utf8State = new Utilities.Utf8MessageState();
    var wholeMessage = new ArraySegment<byte>(data);

    bool accepted = Utilities.TryValidateUtf8(wholeMessage, endOfMessage: true, state: utf8State);

    // Every supplied byte sequence is malformed UTF-8 and must be rejected.
    Assert.False(accepted);
}
/*
[Theory]
// [InlineData(true, new byte[] { 0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5, 0xf4 }, false, new byte[] { 0x90 }, true, new byte[] { })]
public void ValidateMultipleInvalidSegments_Invalid(bool valid1, byte[] data1, bool valid2, byte[] data2, bool valid3, byte[] data3)
[Fact]
public void ValidateIndividualInvalidSegments_Invalid()
{
var data = new byte[] { 0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5, 0xed, 0xa0, 0x80, 0x65, 0x64, 0x69, 0x74, 0x65, 0x64 };
var state = new Utilities.Utf8MessageState();
Assert.True(valid1 == Utilities.TryValidateUtf8(new ArraySegment<byte>(data1), endOfMessage: false, state: state), "1st");
Assert.True(valid2 == Utilities.TryValidateUtf8(new ArraySegment<byte>(data2), endOfMessage: false, state: state), "2nd");
Assert.True(valid3 == Utilities.TryValidateUtf8(new ArraySegment<byte>(data3), endOfMessage: true, state: state), "3rd");
}*/
for (int i = 0; i < 12; i++)
{
Assert.True(Utilities.TryValidateUtf8(new ArraySegment<byte>(data, i, 1), endOfMessage: false, state: state), i.ToString());
}
Assert.False(Utilities.TryValidateUtf8(new ArraySegment<byte>(data, 12, 1), endOfMessage: false, state: state), 12.ToString());
}
[Fact]
public void ValidateMultipleInvalidSegments_Invalid()
{
    // The first segment ends with 0xF4, the lead byte of a four-byte sequence,
    // so the sequence is left unfinished when the segment ends.
    var leadingBytes = new byte[] { 0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5, 0xf4 };
    // 0x90 following 0xF4 would decode to a value above the Unicode upper bound 0x10FFFF.
    var trailingBytes = new byte[] { 0x90 };
    var utf8State = new Utilities.Utf8MessageState();

    // The same state object is shared so the validator carries the partial sequence across segments.
    bool firstAccepted = Utilities.TryValidateUtf8(new ArraySegment<byte>(leadingBytes), endOfMessage: false, state: utf8State);
    Assert.True(firstAccepted);

    bool secondAccepted = Utilities.TryValidateUtf8(new ArraySegment<byte>(trailingBytes), endOfMessage: false, state: utf8State);
    Assert.False(secondAccepted);
}
}
}