From 98e9285fa8b2ab00abe81374466d8f1a8c0fa9a3 Mon Sep 17 00:00:00 2001 From: Chris Ross Date: Thu, 23 Oct 2014 17:34:26 -0700 Subject: [PATCH] #13 - Detailed UTF-8 validation. --- .../Utilities.cs | 53 ++++++++++++++----- .../Utf8ValidationTests.cs | 45 ++++++++++------ 2 files changed, 68 insertions(+), 30 deletions(-) diff --git a/src/Microsoft.AspNet.WebSockets.Protocol/Utilities.cs b/src/Microsoft.AspNet.WebSockets.Protocol/Utilities.cs index a5a27c8299..6531bea1a5 100644 --- a/src/Microsoft.AspNet.WebSockets.Protocol/Utilities.cs +++ b/src/Microsoft.AspNet.WebSockets.Protocol/Utilities.cs @@ -70,49 +70,52 @@ namespace Microsoft.AspNet.WebSockets.Protocol } } - // For now this is stateless and does not handle sequences spliced across messages. - // http://etutorials.org/Programming/secure+programming/Chapter+3.+Input+Validation/3.12+Detecting+Illegal+UTF-8+Characters/ + // Performs a stateful validation of UTF-8 bytes. + // It checks for valid formatting, overlong encodings, surrogates, and value ranges. public static bool TryValidateUtf8(ArraySegment<byte> arraySegment, bool endOfMessage, Utf8MessageState state) { for (int i = arraySegment.Offset; i < arraySegment.Offset + arraySegment.Count; ) { + // Have we started a character sequence yet? if (!state.SequenceInProgress) { + // The first byte tells us how many bytes are in the sequence. state.SequenceInProgress = true; byte b = arraySegment.Array[i]; + i++; if ((b & 0x80) == 0) // 0bbbbbbb, single byte { state.AdditionalBytesExpected = 0; + state.CurrentDecodeBits = b & 0x7F; + state.ExpectedValueMin = 0; } else if ((b & 0xC0) == 0x80) { - return false; // Misplaced 10bbbbbb byte. This cannot be the first byte. + // Misplaced 10bbbbbb continuation byte. This cannot be the first byte. 
+ return false; } else if ((b & 0xE0) == 0xC0) // 110bbbbb 10bbbbbb { state.AdditionalBytesExpected = 1; + state.CurrentDecodeBits = b & 0x1F; + state.ExpectedValueMin = 0x80; } else if ((b & 0xF0) == 0xE0) // 1110bbbb 10bbbbbb 10bbbbbb { state.AdditionalBytesExpected = 2; + state.CurrentDecodeBits = b & 0xF; + state.ExpectedValueMin = 0x800; } else if ((b & 0xF8) == 0xF0) // 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb { state.AdditionalBytesExpected = 3; + state.CurrentDecodeBits = b & 0x7; + state.ExpectedValueMin = 0x10000; } - else if ((b & 0xFC) == 0xF8) // 111110bb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb - { - state.AdditionalBytesExpected = 4; - } - else if ((b & 0xFE) == 0xFC) // 1111110b 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb - { - state.AdditionalBytesExpected = 5; - } - else // 11111110 && 11111111 are not valid + else // 111110bb, 1111110b, 11111110, and 11111111 are not valid lead bytes { return false; } - i++; } while (state.AdditionalBytesExpected > 0 && i < arraySegment.Offset + arraySegment.Count) { @@ -121,12 +124,32 @@ namespace Microsoft.AspNet.WebSockets.Protocol { return false; } - state.AdditionalBytesExpected--; + i++; + state.AdditionalBytesExpected--; + + // Each continuation byte (bit pattern 10bbbbbb) carries 6 bits of data. + state.CurrentDecodeBits = (state.CurrentDecodeBits << 6) | b & 0x3F; + + if (state.AdditionalBytesExpected == 1 && state.CurrentDecodeBits >= 0x360 && state.CurrentDecodeBits <= 0x37F) + { + // This is going to end up in the range of 0xD800-0xDFFF UTF-16 surrogates that are not allowed in UTF-8. + return false; + } + if (state.AdditionalBytesExpected == 2 && state.CurrentDecodeBits >= 0x110) + { + // This is going to be out of the upper Unicode bound 0x10FFFF. + return false; + } } if (state.AdditionalBytesExpected == 0) { state.SequenceInProgress = false; + if (state.CurrentDecodeBits < state.ExpectedValueMin) + { + // Overlong encoding (e.g. using 2 bytes to encode something that only needed 1). 
+ return false; + } } } if (endOfMessage && state.SequenceInProgress) @@ -140,6 +163,8 @@ namespace Microsoft.AspNet.WebSockets.Protocol { public bool SequenceInProgress { get; set; } public int AdditionalBytesExpected { get; set; } + public int ExpectedValueMin { get; set; } + public int CurrentDecodeBits { get; set; } } } } diff --git a/test/Microsoft.AspNet.WebSockets.Protocol.Test/Utf8ValidationTests.cs b/test/Microsoft.AspNet.WebSockets.Protocol.Test/Utf8ValidationTests.cs index a34fa2616d..ffd990b5e2 100644 --- a/test/Microsoft.AspNet.WebSockets.Protocol.Test/Utf8ValidationTests.cs +++ b/test/Microsoft.AspNet.WebSockets.Protocol.Test/Utf8ValidationTests.cs @@ -13,6 +13,8 @@ namespace Microsoft.AspNet.WebSockets.Protocol.Test [InlineData(new byte[] { })] [InlineData(new byte[] { 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x57, 0x6F, 0x72, 0x6C, 0x64 })] // Hello World [InlineData(new byte[] { 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x2D, 0xC2, 0xB5, 0x40, 0xC3, 0x9F, 0xC3, 0xB6, 0xC3, 0xA4, 0xC3, 0xBC, 0xC3, 0xA0, 0xC3, 0xA1 })] // "Hello-µ@ßöäüàá"; + // [InlineData(new byte[] { 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0xf0, 0xa4, 0xad, 0xa2, 0x77, 0x6f, 0x72, 0x6c, 0x64 })] // "hello\U00024b62world" + [InlineData(new byte[] { 0xf0, 0xa4, 0xad, 0xa2 })] // "\U00024b62" public void ValidateSingleValidSegments_Valid(byte[] data) { var state = new Utilities.Utf8MessageState(); @@ -35,28 +37,39 @@ namespace Microsoft.AspNet.WebSockets.Protocol.Test [InlineData(new byte[] { 0xfe })] [InlineData(new byte[] { 0xff })] [InlineData(new byte[] { 0xfe, 0xfe, 0xff, 0xff })] - // [InlineData(new byte[] { 0xc0, 0xaf })] - // [InlineData(new byte[] { 0xe0, 0x80, 0xaf })] - // [InlineData(new byte[] { 0xf4, 0x90, 0x80, 0x80 })] - // [InlineData(new byte[] { 0xf0, 0x80, 0x80, 0xaf })] - // [InlineData(new byte[] { 0xf8, 0x80, 0x80, 0x80, 0xaf })] - // [InlineData(new byte[] { 0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf })] - // [InlineData(new byte[] { 0xc1, 0xbf })] - // [InlineData(new byte[] { 0xed, 0xa0, 
0x80, 0x65, 0x64, 0x69, 0x74, 0x65, 0x64 })] // 0xEDA080 decodes to 0xD800, which is a reserved high surrogate character. + [InlineData(new byte[] { 0xc0, 0xb1 })] // Overlong Ascii + [InlineData(new byte[] { 0xc1, 0xb1 })] // Overlong Ascii + [InlineData(new byte[] { 0xe0, 0x80, 0xaf })] // Overlong + [InlineData(new byte[] { 0xf0, 0x80, 0x80, 0xaf })] // Overlong + [InlineData(new byte[] { 0xf8, 0x80, 0x80, 0x80, 0xaf })] // Overlong + [InlineData(new byte[] { 0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf })] // Overlong + [InlineData(new byte[] { 0xed, 0xa0, 0x80, 0x65, 0x64, 0x69, 0x74, 0x65, 0x64 })] // 0xEDA080 decodes to 0xD800, which is a reserved high surrogate character. public void ValidateSingleInvalidSegment_Invalid(byte[] data) { var state = new Utilities.Utf8MessageState(); Assert.False(Utilities.TryValidateUtf8(new ArraySegment<byte>(data), endOfMessage: true, state: state)); } - /* - [Theory] - // [InlineData(true, new byte[] { 0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5, 0xf4 }, false, new byte[] { 0x90 }, true, new byte[] { })] - public void ValidateMultipleInvalidSegments_Invalid(bool valid1, byte[] data1, bool valid2, byte[] data2, bool valid3, byte[] data3) + + [Fact] + public void ValidateIndividualInvalidSegments_Invalid() { + var data = new byte[] { 0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5, 0xed, 0xa0, 0x80, 0x65, 0x64, 0x69, 0x74, 0x65, 0x64 }; var state = new Utilities.Utf8MessageState(); - Assert.True(valid1 == Utilities.TryValidateUtf8(new ArraySegment<byte>(data1), endOfMessage: false, state: state), "1st"); - Assert.True(valid2 == Utilities.TryValidateUtf8(new ArraySegment<byte>(data2), endOfMessage: false, state: state), "2nd"); - Assert.True(valid3 == Utilities.TryValidateUtf8(new ArraySegment<byte>(data3), endOfMessage: true, state: state), "3rd"); - }*/ + for (int i = 0; i < 12; i++) + { + Assert.True(Utilities.TryValidateUtf8(new ArraySegment<byte>(data, i, 1), endOfMessage: false, state: state), i.ToString()); + } + 
Assert.False(Utilities.TryValidateUtf8(new ArraySegment<byte>(data, 12, 1), endOfMessage: false, state: state), 12.ToString()); + } + + [Fact] + public void ValidateMultipleInvalidSegments_Invalid() + { + var data0 = new byte[] { 0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5, 0xf4 }; + var data1 = new byte[] { 0x90 }; + var state = new Utilities.Utf8MessageState(); + Assert.True(Utilities.TryValidateUtf8(new ArraySegment<byte>(data0), endOfMessage: false, state: state)); + Assert.False(Utilities.TryValidateUtf8(new ArraySegment<byte>(data1), endOfMessage: false, state: state)); + } } } \ No newline at end of file