412 lines
14 KiB
C#
412 lines
14 KiB
C#
// Licensed to the .NET Foundation under one or more agreements.
|
|
// The .NET Foundation licenses this file to you under the MIT license.
|
|
// See the LICENSE file in the project root for more information.
|
|
|
|
using System;
|
|
|
|
namespace Microsoft.AspNetCore.Connections.Abstractions
|
|
{
|
|
internal class UrlDecoder
|
|
{
|
|
static bool[] IsAllowed = new bool[0x7F + 1];
|
|
|
|
/// <summary>
|
|
/// Unescape a URL path
|
|
/// </summary>
|
|
/// <param name="source">The byte span represents a UTF8 encoding url path.</param>
|
|
/// <param name="destination">The byte span where unescaped url path is copied to.</param>
|
|
/// <returns>The length of the byte sequence of the unescaped url path.</returns>
|
|
public static int Decode(ReadOnlySpan<byte> source, Span<byte> destination)
|
|
{
|
|
if (destination.Length < source.Length)
|
|
{
|
|
throw new ArgumentException(
|
|
"Lenghth of the destination byte span is less then the source.",
|
|
nameof(destination));
|
|
}
|
|
|
|
// This requires the destination span to be larger or equal to source span
|
|
source.CopyTo(destination);
|
|
return DecodeInPlace(destination);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Unescape a URL path in place.
|
|
/// </summary>
|
|
/// <param name="buffer">The byte span represents a UTF8 encoding url path.</param>
|
|
/// <returns>The number of the bytes representing the result.</returns>
|
|
/// <remarks>
|
|
/// The unescape is done in place, which means after decoding the result is the subset of
|
|
/// the input span.
|
|
/// </remarks>
|
|
public static int DecodeInPlace(Span<byte> buffer)
|
|
{
|
|
// the slot to read the input
|
|
var sourceIndex = 0;
|
|
|
|
// the slot to write the unescaped byte
|
|
var destinationIndex = 0;
|
|
|
|
while (true)
|
|
{
|
|
if (sourceIndex == buffer.Length)
|
|
{
|
|
break;
|
|
}
|
|
|
|
if (buffer[sourceIndex] == '%')
|
|
{
|
|
var decodeIndex = sourceIndex;
|
|
|
|
// If decoding process succeeds, the writer iterator will be moved
|
|
// to the next write-ready location. On the other hand if the scanned
|
|
// percent-encodings cannot be interpreted as sequence of UTF-8 octets,
|
|
// these bytes should be copied to output as is.
|
|
// The decodeReader iterator is always moved to the first byte not yet
|
|
// be scanned after the process. A failed decoding means the chars
|
|
// between the reader and decodeReader can be copied to output untouched.
|
|
if (!DecodeCore(ref decodeIndex, ref destinationIndex, buffer))
|
|
{
|
|
Copy(sourceIndex, decodeIndex, ref destinationIndex, buffer);
|
|
}
|
|
|
|
sourceIndex = decodeIndex;
|
|
}
|
|
else
|
|
{
|
|
buffer[destinationIndex++] = buffer[sourceIndex++];
|
|
}
|
|
}
|
|
|
|
return destinationIndex;
|
|
}
|
|
|
|
/// <summary>
|
|
/// Unescape the percent-encodings
|
|
/// </summary>
|
|
/// <param name="sourceIndex">The iterator point to the first % char</param>
|
|
/// <param name="destinationIndex">The place to write to</param>
|
|
/// <param name="buffer">The byte array</param>
|
|
private static bool DecodeCore(ref int sourceIndex, ref int destinationIndex, Span<byte> buffer)
|
|
{
|
|
// preserves the original head. if the percent-encodings cannot be interpreted as sequence of UTF-8 octets,
|
|
// bytes from this till the last scanned one will be copied to the memory pointed by writer.
|
|
var byte1 = UnescapePercentEncoding(ref sourceIndex, buffer);
|
|
if (byte1 == -1)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (byte1 == 0)
|
|
{
|
|
throw new InvalidOperationException("The path contains null characters.");
|
|
}
|
|
|
|
if (byte1 <= 0x7F)
|
|
{
|
|
// first byte < U+007f, it is a single byte ASCII
|
|
buffer[destinationIndex++] = (byte)byte1;
|
|
return true;
|
|
}
|
|
|
|
int byte2 = 0, byte3 = 0, byte4 = 0;
|
|
|
|
// anticipate more bytes
|
|
var currentDecodeBits = 0;
|
|
var byteCount = 1;
|
|
var expectValueMin = 0;
|
|
if ((byte1 & 0xE0) == 0xC0)
|
|
{
|
|
// 110x xxxx, expect one more byte
|
|
currentDecodeBits = byte1 & 0x1F;
|
|
byteCount = 2;
|
|
expectValueMin = 0x80;
|
|
}
|
|
else if ((byte1 & 0xF0) == 0xE0)
|
|
{
|
|
// 1110 xxxx, expect two more bytes
|
|
currentDecodeBits = byte1 & 0x0F;
|
|
byteCount = 3;
|
|
expectValueMin = 0x800;
|
|
}
|
|
else if ((byte1 & 0xF8) == 0xF0)
|
|
{
|
|
// 1111 0xxx, expect three more bytes
|
|
currentDecodeBits = byte1 & 0x07;
|
|
byteCount = 4;
|
|
expectValueMin = 0x10000;
|
|
}
|
|
else
|
|
{
|
|
// invalid first byte
|
|
return false;
|
|
}
|
|
|
|
var remainingBytes = byteCount - 1;
|
|
while (remainingBytes > 0)
|
|
{
|
|
// read following three chars
|
|
if (sourceIndex == buffer.Length)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
var nextSourceIndex = sourceIndex;
|
|
var nextByte = UnescapePercentEncoding(ref nextSourceIndex, buffer);
|
|
if (nextByte == -1)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if ((nextByte & 0xC0) != 0x80)
|
|
{
|
|
// the follow up byte is not in form of 10xx xxxx
|
|
return false;
|
|
}
|
|
|
|
currentDecodeBits = (currentDecodeBits << 6) | (nextByte & 0x3F);
|
|
remainingBytes--;
|
|
|
|
if (remainingBytes == 1 && currentDecodeBits >= 0x360 && currentDecodeBits <= 0x37F)
|
|
{
|
|
// this is going to end up in the range of 0xD800-0xDFFF UTF-16 surrogates that
|
|
// are not allowed in UTF-8;
|
|
return false;
|
|
}
|
|
|
|
if (remainingBytes == 2 && currentDecodeBits >= 0x110)
|
|
{
|
|
// this is going to be out of the upper Unicode bound 0x10FFFF.
|
|
return false;
|
|
}
|
|
|
|
sourceIndex = nextSourceIndex;
|
|
if (byteCount - remainingBytes == 2)
|
|
{
|
|
byte2 = nextByte;
|
|
}
|
|
else if (byteCount - remainingBytes == 3)
|
|
{
|
|
byte3 = nextByte;
|
|
}
|
|
else if (byteCount - remainingBytes == 4)
|
|
{
|
|
byte4 = nextByte;
|
|
}
|
|
}
|
|
|
|
if (currentDecodeBits < expectValueMin)
|
|
{
|
|
// overlong encoding (e.g. using 2 bytes to encode something that only needed 1).
|
|
return false;
|
|
}
|
|
|
|
// all bytes are verified, write to the output
|
|
// TODO: measure later to determine if the performance of following logic can be improved
|
|
// the idea is to combine the bytes into short/int and write to span directly to avoid
|
|
// range check cost
|
|
if (byteCount > 0)
|
|
{
|
|
buffer[destinationIndex++] = (byte)byte1;
|
|
}
|
|
if (byteCount > 1)
|
|
{
|
|
buffer[destinationIndex++] = (byte)byte2;
|
|
}
|
|
if (byteCount > 2)
|
|
{
|
|
buffer[destinationIndex++] = (byte)byte3;
|
|
}
|
|
if (byteCount > 3)
|
|
{
|
|
buffer[destinationIndex++] = (byte)byte4;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
private static void Copy(int begin, int end, ref int writer, Span<byte> buffer)
|
|
{
|
|
while (begin != end)
|
|
{
|
|
buffer[writer++] = buffer[begin++];
|
|
}
|
|
}
|
|
|
|
/// <summary>
|
|
/// Read the percent-encoding and try unescape it.
|
|
///
|
|
/// The operation first peek at the character the <paramref name="scan"/>
|
|
/// iterator points at. If it is % the <paramref name="scan"/> is then
|
|
/// moved on to scan the following to characters. If the two following
|
|
/// characters are hexadecimal literals they will be unescaped and the
|
|
/// value will be returned.
|
|
///
|
|
/// If the first character is not % the <paramref name="scan"/> iterator
|
|
/// will be removed beyond the location of % and -1 will be returned.
|
|
///
|
|
/// If the following two characters can't be successfully unescaped the
|
|
/// <paramref name="scan"/> iterator will be move behind the % and -1
|
|
/// will be returned.
|
|
/// </summary>
|
|
/// <param name="scan">The value to read</param>
|
|
/// <param name="buffer">The byte array</param>
|
|
/// <returns>The unescaped byte if success. Otherwise return -1.</returns>
|
|
private static int UnescapePercentEncoding(ref int scan, Span<byte> buffer)
|
|
{
|
|
if (buffer[scan++] != '%')
|
|
{
|
|
return -1;
|
|
}
|
|
|
|
var probe = scan;
|
|
|
|
var value1 = ReadHex(ref probe, buffer);
|
|
if (value1 == -1)
|
|
{
|
|
return -1;
|
|
}
|
|
|
|
var value2 = ReadHex(ref probe, buffer);
|
|
if (value2 == -1)
|
|
{
|
|
return -1;
|
|
}
|
|
|
|
if (SkipUnescape(value1, value2))
|
|
{
|
|
return -1;
|
|
}
|
|
|
|
scan = probe;
|
|
return (value1 << 4) + value2;
|
|
}
|
|
|
|
|
|
/// <summary>
|
|
/// Read the next char and convert it into hexadecimal value.
|
|
///
|
|
/// The <paramref name="scan"/> index will be moved to the next
|
|
/// byte no matter no matter whether the operation successes.
|
|
/// </summary>
|
|
/// <param name="scan">The index of the byte in the buffer to read</param>
|
|
/// <param name="buffer">The byte span from which the hex to be read</param>
|
|
/// <returns>The hexadecimal value if successes, otherwise -1.</returns>
|
|
private static int ReadHex(ref int scan, Span<byte> buffer)
|
|
{
|
|
if (scan == buffer.Length)
|
|
{
|
|
return -1;
|
|
}
|
|
|
|
var value = buffer[scan++];
|
|
var isHead = ((value >= '0') && (value <= '9')) ||
|
|
((value >= 'A') && (value <= 'F')) ||
|
|
((value >= 'a') && (value <= 'f'));
|
|
|
|
if (!isHead)
|
|
{
|
|
return -1;
|
|
}
|
|
|
|
if (value <= '9')
|
|
{
|
|
return value - '0';
|
|
}
|
|
else if (value <= 'F')
|
|
{
|
|
return (value - 'A') + 10;
|
|
}
|
|
else // a - f
|
|
{
|
|
return (value - 'a') + 10;
|
|
}
|
|
}
|
|
|
|
private static bool SkipUnescape(int value1, int value2)
|
|
{
|
|
// skip %2F - '/'
|
|
if (value1 == 2 && value2 == 15)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
static UrlDecoder()
|
|
{
|
|
// Unreserved
|
|
IsAllowed['A'] = true;
|
|
IsAllowed['B'] = true;
|
|
IsAllowed['C'] = true;
|
|
IsAllowed['D'] = true;
|
|
IsAllowed['E'] = true;
|
|
IsAllowed['F'] = true;
|
|
IsAllowed['G'] = true;
|
|
IsAllowed['H'] = true;
|
|
IsAllowed['I'] = true;
|
|
IsAllowed['J'] = true;
|
|
IsAllowed['K'] = true;
|
|
IsAllowed['L'] = true;
|
|
IsAllowed['M'] = true;
|
|
IsAllowed['N'] = true;
|
|
IsAllowed['O'] = true;
|
|
IsAllowed['P'] = true;
|
|
IsAllowed['Q'] = true;
|
|
IsAllowed['R'] = true;
|
|
IsAllowed['S'] = true;
|
|
IsAllowed['T'] = true;
|
|
IsAllowed['U'] = true;
|
|
IsAllowed['V'] = true;
|
|
IsAllowed['W'] = true;
|
|
IsAllowed['X'] = true;
|
|
IsAllowed['Y'] = true;
|
|
IsAllowed['Z'] = true;
|
|
|
|
IsAllowed['a'] = true;
|
|
IsAllowed['b'] = true;
|
|
IsAllowed['c'] = true;
|
|
IsAllowed['d'] = true;
|
|
IsAllowed['e'] = true;
|
|
IsAllowed['f'] = true;
|
|
IsAllowed['g'] = true;
|
|
IsAllowed['h'] = true;
|
|
IsAllowed['i'] = true;
|
|
IsAllowed['j'] = true;
|
|
IsAllowed['k'] = true;
|
|
IsAllowed['l'] = true;
|
|
IsAllowed['m'] = true;
|
|
IsAllowed['n'] = true;
|
|
IsAllowed['o'] = true;
|
|
IsAllowed['p'] = true;
|
|
IsAllowed['q'] = true;
|
|
IsAllowed['r'] = true;
|
|
IsAllowed['s'] = true;
|
|
IsAllowed['t'] = true;
|
|
IsAllowed['u'] = true;
|
|
IsAllowed['v'] = true;
|
|
IsAllowed['w'] = true;
|
|
IsAllowed['x'] = true;
|
|
IsAllowed['y'] = true;
|
|
IsAllowed['z'] = true;
|
|
|
|
IsAllowed['0'] = true;
|
|
IsAllowed['1'] = true;
|
|
IsAllowed['2'] = true;
|
|
IsAllowed['3'] = true;
|
|
IsAllowed['4'] = true;
|
|
IsAllowed['5'] = true;
|
|
IsAllowed['6'] = true;
|
|
IsAllowed['7'] = true;
|
|
IsAllowed['8'] = true;
|
|
IsAllowed['9'] = true;
|
|
|
|
IsAllowed['-'] = true;
|
|
IsAllowed['_'] = true;
|
|
IsAllowed['.'] = true;
|
|
IsAllowed['~'] = true;
|
|
}
|
|
}
|
|
}
|