Quickly fail the UTF-8 validation if the first byte is already invalid

2017-04-08 20:05:55 +02:00 · 2017-04-08 20:05:55 +02:00 · 4c183b4d00
parent 841ceb24b6
commit 4c183b4d00
3 changed files with 22 additions and 6 deletions
--- a/src/Microsoft.Extensions.WebSockets.Internal/Utf8Validator.cs
+++ b/src/Microsoft.Extensions.WebSockets.Internal/Utf8Validator.cs
@ -12,6 +12,9 @@ namespace Microsoft.Extensions.WebSockets.Internal
    public class Utf8Validator
    {
        // Table of UTF-8 code point widths. '0' indicates an invalid first byte.
+        // 0x80 - 0xBF are the continuation bytes and invalid as first byte.
+        // 0xC0 - 0xC1 are overlong encodings of ASCII characters
+        // 0xF5 - 0xFF encode numbers that are larger than the Unicode limit (0x10FFFF)
        private static readonly byte[] _utf8Width = new byte[256]
        {
            /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x0F */
@ -26,10 +29,10 @@ namespace Microsoft.Extensions.WebSockets.Internal
            /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x9F */
            /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xAF */
            /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xBF */
-            /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xCF */
+            /* 0xC0 */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xCF */
            /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xDF */
            /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xEF */
-            /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, /* 0xFF */
+            /* 0xF0 */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xFF */
        };

        // Table of masks used to extract the code point bits from the first byte. Indexed by (width - 1)
--- a/test/Microsoft.Extensions.WebSockets.Internal.Tests/Utf8ValidatorTests.cs
+++ b/test/Microsoft.Extensions.WebSockets.Internal.Tests/Utf8ValidatorTests.cs
@ -90,12 +90,27 @@ namespace Microsoft.Extensions.WebSockets.Internal.Tests

        // '\u0800' (3 byte char) encoded with 4 bytes
        [InlineData(new byte[] { 0xF0, 0x80, 0xA0, 0x80 })]
+
+        // Code point larger than what is allowed
+        [InlineData(new byte[] { 0xF5, 0x80, 0x80, 0x80 })]
        public void InvalidSingleFramePayloads(byte[] payload)
        {
            var validator = new Utf8Validator();
            Assert.False(validator.ValidateUtf8Frame(ReadableBuffer.Create(payload), fin: true));
        }

+        [Theory]
+
+        [InlineData(new byte[] { 0xC0 })] // overlong encoding of ASCII
+        [InlineData(new byte[] { 0xC1 })] // overlong encoding of ASCII
+        [InlineData(new byte[] { 0xF5 })] // larger than the Unicode limit
+        public void InvalidMultiByteSequencesByFirstByte(byte[] payload)
+        {
+            var validator = new Utf8Validator();
+            Assert.False(validator.ValidateUtf8Frame(ReadableBuffer.Create(payload), fin: false));
+        }
+
+
        [Theory]

        // Continuation byte as first byte of code point
@ -113,8 +128,7 @@ namespace Microsoft.Extensions.WebSockets.Internal.Tests

        // Overlong Encoding

-        // 'H' (1 byte char) encoded with 2, 3 and 4 bytes
-        [InlineData(new byte[] { 0xC1 }, new byte[] { 0x88 })]
+        // 'H' (1 byte char) encoded with 3 and 4 bytes
        [InlineData(new byte[] { 0xE0 }, new byte[] { 0x81, 0x88 })]
        [InlineData(new byte[] { 0xF0 }, new byte[] { 0x80, 0x81, 0x88 })]

--- a/test/Microsoft.Extensions.WebSockets.Internal.Tests/WebSocketConnectionTests.Utf8Validation.cs
+++ b/test/Microsoft.Extensions.WebSockets.Internal.Tests/WebSocketConnectionTests.Utf8Validation.cs
@ -175,8 +175,7 @@ namespace Microsoft.Extensions.WebSockets.Internal.Tests

        // Overlong Encoding

-        // 'H' (1 byte char) encoded with 2, 3 and 4 bytes
-        [InlineData(new byte[] { 0xC1 }, new byte[] { 0x88 })]
+        // 'H' (1 byte char) encoded with 3 and 4 bytes
        [InlineData(new byte[] { 0xE0 }, new byte[] { 0x81, 0x88 })]
        [InlineData(new byte[] { 0xF0 }, new byte[] { 0x80, 0x81, 0x88 })]