aspnetcore/test/Microsoft.Extensions.WebSoc.../Utf8ValidatorTests.cs

// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.

using System.IO.Pipelines;
using System.Linq;
using System.Text;
using Xunit;

namespace Microsoft.Extensions.WebSockets.Internal.Tests
{
    public class Utf8ValidatorTests
    {
        // A well-formed UTF-8 payload delivered as one final frame must be accepted.
        // "decoded" is the text each byte sequence stands for, so a reader can see what is being fed in.
        [Theory]
        [InlineData(new byte[] { 0x48, 0x65, 0x6C, 0x6C, 0x6F }, "Hello")]
        [InlineData(new byte[] { 0xC2, 0xA7, 0x31, 0x2C, 0x20, 0x39, 0x35, 0xC2, 0xA2 }, "§1, 95¢")]
        [InlineData(new byte[] { 0xE0, 0xA0, 0x80, 0xE0, 0xA4, 0x80 }, "\u0800\u0900")]
        [InlineData(new byte[] { 0xF0, 0x90, 0x80, 0x80 }, "\U00010000")]
        public void ValidSingleFramePayloads(byte[] payload, string decoded)
        {
            var utf8 = new Utf8Validator();
            var frame = ReadableBuffer.Create(payload);
            var accepted = utf8.ValidateUtf8Frame(frame, fin: true);
            Assert.True(accepted);

            // This guards the test data rather than the validator: if the bytes did not decode to
            // "decoded", the human-readable label on the test case would be misleading.
            var actualText = Encoding.UTF8.GetString(payload);
            Assert.Equal(decoded, actualText);
        }

        // A well-formed UTF-8 payload split over two frames (non-final, then final) must be accepted
        // wherever the frame boundary falls, including in the middle of a multi-byte code point
        // and when one of the two frames is empty.
        [Theory]
        // ASCII text split between the frames
        [InlineData(new byte[] { 0x48, 0x65 }, new byte[] { 0x6C, 0x6C, 0x6F }, "Hello")]

        // Two-byte code point, boundary at every position
        [InlineData(new byte[0], new byte[] { 0xC2, 0xA7 }, "§")]
        [InlineData(new byte[] { 0xC2 }, new byte[] { 0xA7 }, "§")]
        [InlineData(new byte[] { 0xC2, 0xA7 }, new byte[0], "§")]

        [InlineData(new byte[0], new byte[] { 0xC2, 0xA2 }, "¢")]
        [InlineData(new byte[] { 0xC2 }, new byte[] { 0xA2 }, "¢")]
        [InlineData(new byte[] { 0xC2, 0xA2 }, new byte[0], "¢")]

        // Three-byte code point, boundary at every position
        [InlineData(new byte[0], new byte[] { 0xE0, 0xA0, 0x80 }, "\u0800")]
        [InlineData(new byte[] { 0xE0 }, new byte[] { 0xA0, 0x80 }, "\u0800")]
        [InlineData(new byte[] { 0xE0, 0xA0 }, new byte[] { 0x80 }, "\u0800")]
        [InlineData(new byte[] { 0xE0, 0xA0, 0x80 }, new byte[0], "\u0800")]

        [InlineData(new byte[0], new byte[] { 0xE0, 0xA4, 0x80 }, "\u0900")]
        [InlineData(new byte[] { 0xE0 }, new byte[] { 0xA4, 0x80 }, "\u0900")]
        [InlineData(new byte[] { 0xE0, 0xA4 }, new byte[] { 0x80 }, "\u0900")]
        [InlineData(new byte[] { 0xE0, 0xA4, 0x80 }, new byte[0], "\u0900")]

        // Four-byte code point, boundary at every position
        [InlineData(new byte[0], new byte[] { 0xF0, 0x90, 0x80, 0x80 }, "\U00010000")]
        [InlineData(new byte[] { 0xF0 }, new byte[] { 0x90, 0x80, 0x80 }, "\U00010000")]
        [InlineData(new byte[] { 0xF0, 0x90 }, new byte[] { 0x80, 0x80 }, "\U00010000")]
        [InlineData(new byte[] { 0xF0, 0x90, 0x80 }, new byte[] { 0x80 }, "\U00010000")]
        [InlineData(new byte[] { 0xF0, 0x90, 0x80, 0x80 }, new byte[0], "\U00010000")]
        public void ValidMultiFramePayloads(byte[] payload1, byte[] payload2, string decoded)
        {
            // One validator instance handles both frames of the message.
            var utf8 = new Utf8Validator();

            var firstFrame = ReadableBuffer.Create(payload1);
            var firstAccepted = utf8.ValidateUtf8Frame(firstFrame, fin: false);
            Assert.True(firstAccepted);

            var secondFrame = ReadableBuffer.Create(payload2);
            var secondAccepted = utf8.ValidateUtf8Frame(secondFrame, fin: true);
            Assert.True(secondAccepted);

            // This guards the test data rather than the validator: the two payloads joined together
            // must decode to "decoded", otherwise the human-readable label would be misleading.
            var wholeMessage = payload1.Concat(payload2).ToArray();
            var actualText = Encoding.UTF8.GetString(wholeMessage);
            Assert.Equal(decoded, actualText);
        }

        // A malformed UTF-8 payload delivered as one final frame must be rejected.
        [Theory]

        // A continuation byte (10xxxxxx) sits where a new code point should begin
        [InlineData(new byte[] { 0x48, 0x65, 0x80, 0x6C, 0x6F })]
        [InlineData(new byte[] { 0x48, 0x65, 0x99, 0x6C, 0x6F })]
        [InlineData(new byte[] { 0x48, 0x65, 0xAB, 0x6C, 0x6F })]
        [InlineData(new byte[] { 0x48, 0x65, 0xB0, 0x6C, 0x6F })]

        // The payload ends before the code point is complete
        [InlineData(new byte[] { 0xC2 })]
        [InlineData(new byte[] { 0xE0 })]
        [InlineData(new byte[] { 0xE0, 0xA0 })]
        [InlineData(new byte[] { 0xE0, 0xA4 })]
        [InlineData(new byte[] { 0xF0, 0x90, 0x80 })]

        // Overlong encodings: a code point written with more bytes than it needs

        // 'H' (needs 1 byte) written with 2, 3 and 4 bytes
        [InlineData(new byte[] { 0xC1, 0x88 })]
        [InlineData(new byte[] { 0xE0, 0x81, 0x88 })]
        [InlineData(new byte[] { 0xF0, 0x80, 0x81, 0x88 })]

        // '§' (needs 2 bytes) written with 3 and 4 bytes
        [InlineData(new byte[] { 0xE0, 0x82, 0xA7 })]
        [InlineData(new byte[] { 0xF0, 0x80, 0x82, 0xA7 })]

        // '\u0800' (needs 3 bytes) written with 4 bytes
        [InlineData(new byte[] { 0xF0, 0x80, 0xA0, 0x80 })]

        // Lead byte 0xF5 would encode a code point above the Unicode maximum
        [InlineData(new byte[] { 0xF5, 0x80, 0x80, 0x80 })]
        public void InvalidSingleFramePayloads(byte[] payload)
        {
            var utf8 = new Utf8Validator();
            var frame = ReadableBuffer.Create(payload);
            var accepted = utf8.ValidateUtf8Frame(frame, fin: true);
            Assert.False(accepted);
        }

        // Some lead bytes can never start a valid sequence. The frame is expected to be rejected
        // on that byte alone, even though fin: false leaves room for more data to follow.
        [Theory]

        [InlineData(new byte[] { 0xC0 })] // could only begin an overlong encoding of an ASCII character
        [InlineData(new byte[] { 0xC1 })] // could only begin an overlong encoding of an ASCII character
        [InlineData(new byte[] { 0xF5 })] // would encode a code point beyond the Unicode limit
        public void InvalidMultiByteSequencesByFirstByte(byte[] payload)
        {
            var utf8 = new Utf8Validator();
            var frame = ReadableBuffer.Create(payload);
            var accepted = utf8.ValidateUtf8Frame(frame, fin: false);
            Assert.False(accepted);
        }


        // A malformed UTF-8 payload split over two frames: the first (non-final) frame is expected
        // to pass, and the second (final) frame is expected to be rejected.
        [Theory]

        // A continuation byte (10xxxxxx) opens the second frame where a new code point should begin
        [InlineData(new byte[] { 0x48, 0x65 }, new byte[] { 0x80, 0x6C, 0x6F })]
        [InlineData(new byte[] { 0x48, 0x65 }, new byte[] { 0x99, 0x6C, 0x6F })]
        [InlineData(new byte[] { 0x48, 0x65 }, new byte[] { 0xAB, 0x6C, 0x6F })]
        [InlineData(new byte[] { 0x48, 0x65 }, new byte[] { 0xB0, 0x6C, 0x6F })]

        // The code point is still incomplete when the empty final frame arrives
        [InlineData(new byte[] { 0xC2 }, new byte[0])]
        [InlineData(new byte[] { 0xE0 }, new byte[0])]
        [InlineData(new byte[] { 0xE0, 0xA0 }, new byte[0])]
        [InlineData(new byte[] { 0xE0, 0xA4 }, new byte[0])]
        [InlineData(new byte[] { 0xF0, 0x90, 0x80 }, new byte[0])]

        // Overlong encodings: a code point written with more bytes than it needs

        // 'H' (needs 1 byte) written with 3 and 4 bytes
        [InlineData(new byte[] { 0xE0 }, new byte[] { 0x81, 0x88 })]
        [InlineData(new byte[] { 0xF0 }, new byte[] { 0x80, 0x81, 0x88 })]

        // '§' (needs 2 bytes) written with 3 and 4 bytes
        [InlineData(new byte[] { 0xE0, 0x82 }, new byte[] { 0xA7 })]
        [InlineData(new byte[] { 0xF0, 0x80 }, new byte[] { 0x82, 0xA7 })]

        // '\u0800' (needs 3 bytes) written with 4 bytes
        [InlineData(new byte[] { 0xF0, 0x80 }, new byte[] { 0xA0, 0x80 })]
        public void InvalidMultiFramePayloads(byte[] payload1, byte[] payload2)
        {
            // One validator instance handles both frames of the message.
            var utf8 = new Utf8Validator();

            var firstFrame = ReadableBuffer.Create(payload1);
            var firstAccepted = utf8.ValidateUtf8Frame(firstFrame, fin: false);
            Assert.True(firstAccepted);

            var secondFrame = ReadableBuffer.Create(payload2);
            var secondAccepted = utf8.ValidateUtf8Frame(secondFrame, fin: true);
            Assert.False(secondAccepted);
        }
    }
}