// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;

namespace Microsoft.AspNetCore.Connections.Abstractions
{
    /// <summary>
    /// Unescapes percent-encoded, UTF-8 encoded URL paths held in byte spans.
    /// </summary>
    internal class UrlDecoder
    {
        // Lookup table indexed by ASCII code; populated by the static constructor.
        // NOTE(review): nothing in this class reads the table; it is kept as-is.
        private static readonly bool[] IsAllowed = new bool[0x7F + 1];

        /// <summary>
        /// Unescape a URL path.
        /// </summary>
        /// <param name="source">The byte span that holds a UTF-8 encoded URL path.</param>
        /// <param name="destination">
        /// The byte span the unescaped URL path is written to. It must be at least as long as
        /// <paramref name="source"/>.
        /// </param>
        /// <returns>The length of the byte sequence of the unescaped URL path.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="destination"/> is shorter than <paramref name="source"/>.
        /// </exception>
        /// <exception cref="InvalidOperationException">The path contains a percent-encoded null (<c>%00</c>).</exception>
        public static int Decode(ReadOnlySpan<byte> source, Span<byte> destination)
        {
            if (destination.Length < source.Length)
            {
                throw new ArgumentException(
                    "Length of the destination byte span is less than the source.",
                    nameof(destination));
            }

            // This requires the destination span to be larger or equal to source span
            source.CopyTo(destination);

            // Decode only the bytes that were just copied. Any bytes of the destination beyond
            // source.Length are not part of the path and must be neither decoded nor counted.
            return DecodeInPlace(destination.Slice(0, source.Length));
        }

        /// <summary>
        /// Unescape a URL path in place.
        /// </summary>
        /// <param name="buffer">The byte span that holds a UTF-8 encoded URL path.</param>
        /// <returns>The number of bytes representing the result.</returns>
        /// <remarks>
        /// The unescape is done in place, which means that after decoding the result occupies
        /// the leading part of the input span.
        /// </remarks>
        /// <exception cref="InvalidOperationException">The path contains a percent-encoded null (<c>%00</c>).</exception>
        public static int DecodeInPlace(Span<byte> buffer)
        {
            // the slot to read the input
            var sourceIndex = 0;

            // the slot to write the unescaped byte
            var destinationIndex = 0;

            while (sourceIndex < buffer.Length)
            {
                if (buffer[sourceIndex] == '%')
                {
                    var decodeIndex = sourceIndex;

                    // If decoding succeeds, destinationIndex has been moved to the next
                    // write-ready location. If the scanned percent-encodings cannot be
                    // interpreted as a sequence of UTF-8 octets, decodeIndex still points at the
                    // first byte not yet consumed, and the bytes between sourceIndex and
                    // decodeIndex are copied to the output untouched.
                    if (!DecodeCore(ref decodeIndex, ref destinationIndex, buffer))
                    {
                        Copy(sourceIndex, decodeIndex, ref destinationIndex, buffer);
                    }

                    sourceIndex = decodeIndex;
                }
                else
                {
                    buffer[destinationIndex++] = buffer[sourceIndex++];
                }
            }

            return destinationIndex;
        }

        /// <summary>
        /// Unescape the percent-encodings that make up one UTF-8 encoded character.
        /// </summary>
        /// <param name="sourceIndex">
        /// On entry, the index of the first <c>%</c>. On return, the index of the first byte that
        /// has not been consumed.
        /// </param>
        /// <param name="destinationIndex">The index to write to; advanced only when decoding succeeds.</param>
        /// <param name="buffer">The byte span that is read from and written to.</param>
        /// <returns>
        /// <c>true</c> if the decoded bytes were written; <c>false</c> if nothing was written and
        /// the caller should copy the consumed bytes unchanged.
        /// </returns>
        /// <exception cref="InvalidOperationException">The first decoded byte is 0.</exception>
        private static bool DecodeCore(ref int sourceIndex, ref int destinationIndex, Span<byte> buffer)
        {
            var byte1 = UnescapePercentEncoding(ref sourceIndex, buffer);

            if (byte1 == -1)
            {
                return false;
            }

            if (byte1 == 0)
            {
                throw new InvalidOperationException("The path contains null characters.");
            }

            if (byte1 <= 0x7F)
            {
                // first byte <= U+007F, it is a single byte ASCII
                buffer[destinationIndex++] = (byte)byte1;
                return true;
            }

            // Continuation bytes are buffered here and written only after the whole sequence has
            // been validated: writing earlier could overwrite source bytes that must be copied
            // verbatim if validation fails.
            int byte2 = 0, byte3 = 0, byte4 = 0;

            int currentDecodeBits;
            int byteCount;
            int expectValueMin;
            if ((byte1 & 0xE0) == 0xC0)
            {
                // 110x xxxx, expect one more byte
                currentDecodeBits = byte1 & 0x1F;
                byteCount = 2;
                expectValueMin = 0x80;
            }
            else if ((byte1 & 0xF0) == 0xE0)
            {
                // 1110 xxxx, expect two more bytes
                currentDecodeBits = byte1 & 0x0F;
                byteCount = 3;
                expectValueMin = 0x800;
            }
            else if ((byte1 & 0xF8) == 0xF0)
            {
                // 1111 0xxx, expect three more bytes
                currentDecodeBits = byte1 & 0x07;
                byteCount = 4;
                expectValueMin = 0x10000;
            }
            else
            {
                // invalid first byte
                return false;
            }

            var remainingBytes = byteCount - 1;
            while (remainingBytes > 0)
            {
                // read the next percent-encoded continuation byte
                if (sourceIndex == buffer.Length)
                {
                    return false;
                }

                var nextSourceIndex = sourceIndex;
                var nextByte = UnescapePercentEncoding(ref nextSourceIndex, buffer);
                if (nextByte == -1)
                {
                    return false;
                }

                if ((nextByte & 0xC0) != 0x80)
                {
                    // the follow up byte is not in form of 10xx xxxx
                    return false;
                }

                currentDecodeBits = (currentDecodeBits << 6) | (nextByte & 0x3F);
                remainingBytes--;

                if (remainingBytes == 1 && currentDecodeBits >= 0x360 && currentDecodeBits <= 0x37F)
                {
                    // this is going to end up in the range of 0xD800-0xDFFF UTF-16 surrogates that
                    // are not allowed in UTF-8;
                    return false;
                }

                if (remainingBytes == 2 && currentDecodeBits >= 0x110)
                {
                    // this is going to be out of the upper Unicode bound 0x10FFFF.
                    return false;
                }

                // The continuation byte is accepted: consume it.
                sourceIndex = nextSourceIndex;

                switch (byteCount - remainingBytes)
                {
                    case 2:
                        byte2 = nextByte;
                        break;
                    case 3:
                        byte3 = nextByte;
                        break;
                    case 4:
                        byte4 = nextByte;
                        break;
                }
            }

            if (currentDecodeBits < expectValueMin)
            {
                // overlong encoding (e.g. using 2 bytes to encode something that only needed 1).
                return false;
            }

            // all bytes are verified, write to the output
            // TODO: measure later to determine if the performance of following logic can be improved
            //       the idea is to combine the bytes into short/int and write to span directly to avoid
            //       range check cost
            // byteCount is 2, 3 or 4 here, so the first two bytes are always present.
            buffer[destinationIndex++] = (byte)byte1;
            buffer[destinationIndex++] = (byte)byte2;

            if (byteCount > 2)
            {
                buffer[destinationIndex++] = (byte)byte3;
            }

            if (byteCount > 3)
            {
                buffer[destinationIndex++] = (byte)byte4;
            }

            return true;
        }

        /// <summary>
        /// Copies the bytes in the range [<paramref name="begin"/>, <paramref name="end"/>) of
        /// <paramref name="buffer"/> to the position indicated by <paramref name="writer"/>.
        /// </summary>
        /// <param name="begin">The index of the first byte to copy.</param>
        /// <param name="end">The index just past the last byte to copy.</param>
        /// <param name="writer">The index to write to; advanced by the number of bytes copied.</param>
        /// <param name="buffer">The byte span that is read from and written to.</param>
        private static void Copy(int begin, int end, ref int writer, Span<byte> buffer)
        {
            while (begin != end)
            {
                buffer[writer++] = buffer[begin++];
            }
        }

        /// <summary>
        /// Read the percent-encoding and try to unescape it.
        ///
        /// The byte at <paramref name="scan"/> is read and <paramref name="scan"/> is moved past
        /// it. If that byte is <c>%</c> and the two following bytes are hexadecimal digits that
        /// do not spell <c>%2F</c>, <paramref name="scan"/> is moved past both digits and the
        /// unescaped value is returned.
        ///
        /// In every other case -1 is returned and <paramref name="scan"/> is left exactly one
        /// byte past its original position.
        /// </summary>
        /// <param name="scan">The index of the byte to read; must be inside <paramref name="buffer"/>.</param>
        /// <param name="buffer">The byte span to read from.</param>
        /// <returns>The unescaped byte if successful; otherwise -1.</returns>
        private static int UnescapePercentEncoding(ref int scan, Span<byte> buffer)
        {
            if (buffer[scan++] != '%')
            {
                return -1;
            }

            // Use a separate cursor so that a failure below leaves scan just behind the '%'.
            var probe = scan;

            var value1 = ReadHex(ref probe, buffer);
            if (value1 == -1)
            {
                return -1;
            }

            var value2 = ReadHex(ref probe, buffer);
            if (value2 == -1)
            {
                return -1;
            }

            if (SkipUnescape(value1, value2))
            {
                return -1;
            }

            scan = probe;
            return (value1 << 4) + value2;
        }

        /// <summary>
        /// Read the next byte and convert it into the value of the hexadecimal digit it represents.
        ///
        /// When a byte is available, <paramref name="scan"/> is moved to the next byte whether or
        /// not the byte is a hexadecimal digit. At the end of the buffer it is left unchanged.
        /// </summary>
        /// <param name="scan">The index of the byte in the buffer to read.</param>
        /// <param name="buffer">The byte span from which the hexadecimal digit is read.</param>
        /// <returns>The hexadecimal value (0-15) if successful; otherwise -1.</returns>
        private static int ReadHex(ref int scan, Span<byte> buffer)
        {
            if (scan == buffer.Length)
            {
                return -1;
            }

            var value = buffer[scan++];

            if (value >= '0' && value <= '9')
            {
                return value - '0';
            }

            if (value >= 'A' && value <= 'F')
            {
                return (value - 'A') + 10;
            }

            if (value >= 'a' && value <= 'f')
            {
                return (value - 'a') + 10;
            }

            return -1;
        }

        /// <summary>
        /// Determines whether a percent-encoding must be left escaped.
        /// </summary>
        /// <param name="value1">The value of the first hexadecimal digit.</param>
        /// <param name="value2">The value of the second hexadecimal digit.</param>
        /// <returns><c>true</c> for <c>%2F</c> ('/'); otherwise <c>false</c>.</returns>
        private static bool SkipUnescape(int value1, int value2)
        {
            // skip %2F - '/'
            return value1 == 2 && value2 == 15;
        }

        static UrlDecoder()
        {
            // Unreserved: 'A'-'Z', 'a'-'z', '0'-'9', '-', '_', '.', '~'
            for (var c = 'A'; c <= 'Z'; c++)
            {
                IsAllowed[c] = true;
            }

            for (var c = 'a'; c <= 'z'; c++)
            {
                IsAllowed[c] = true;
            }

            for (var c = '0'; c <= '9'; c++)
            {
                IsAllowed[c] = true;
            }

            IsAllowed['-'] = true;
            IsAllowed['_'] = true;
            IsAllowed['.'] = true;
            IsAllowed['~'] = true;
        }
    }
}