Add HtmlEncoder, UrlEncoder, and JavaScriptStringEncoder
Also add interfaces for abstracting each of these. Unit tests are not in yet but are coming soon.
This commit is contained in:
parent
dadd9cd9f3
commit
1008e17259
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,37 @@
|
|||
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
|
||||
|
||||
using System;
|
||||
|
||||
namespace Microsoft.AspNet.WebUtilities.Encoders
|
||||
{
|
||||
internal static class EncoderCommon
{
    /// <summary>
    /// Chooses the initial capacity for the StringBuilder that will receive encoder output.
    /// </summary>
    /// <param name="numCharsToEncode">The number of input characters that still have to be encoded.</param>
    /// <param name="worstCaseOutputCharsPerInputChar">The largest number of output characters a single input character can expand to.</param>
    /// <returns>
    /// The input length itself when it already reaches the 16K-char soft ceiling; otherwise the
    /// worst-case output length, capped at that ceiling.
    /// </returns>
    public static int GetCapacityOfOutputStringBuilder(int numCharsToEncode, int worstCaseOutputCharsPerInputChar)
    {
        // 16K chars (32KB) is treated as a soft ceiling for any builder we size up front; the
        // intent is to stay off the LOH whenever we can. Whatever initial size is picked here,
        // StringBuilder allocates additional blocks on its own if the output outgrows it.
        const int softCeiling = 16 * 1024;

        if (numCharsToEncode < softCeiling)
        {
            // Reserve the worst case when it fits under the ceiling, otherwise stop at the ceiling.
            // The product is computed as a long so that it cannot overflow.
            long worstCaseLength = (long)numCharsToEncode * worstCaseOutputCharsPerInputChar;
            return (worstCaseLength < softCeiling) ? (int)worstCaseLength : softCeiling;
        }

        // The output is never shorter than the input, so for inputs at or beyond the ceiling
        // reserve exactly the input length and hope for the best-case outcome.
        return numCharsToEncode;
    }
}
|
||||
}
|
||||
|
|
@ -0,0 +1,48 @@
|
|||
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
|
||||
|
||||
using System;
|
||||
using System.Diagnostics;
|
||||
using System.Runtime.CompilerServices;
|
||||
|
||||
namespace Microsoft.AspNet.WebUtilities.Encoders
|
||||
{
|
||||
/// <summary>
/// Helper routines for converting between numeric values and hexadecimal characters.
/// </summary>
internal static class HexUtil
{
    /// <summary>
    /// Maps a value in the range 0 - 15 to the uppercase hex digit '0' - 'F'.
    /// </summary>
    /// <param name="i">The nibble to convert; expected to be less than 16 (checked by a debug assertion only).</param>
    /// <returns>The hex digit character for <paramref name="i"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static char IntToChar(uint i)
    {
        Debug.Assert(i < 16);

        if (i < 10)
        {
            return (char)('0' + i);
        }

        return (char)('A' + (i - 10));
    }

    /// <summary>
    /// Converts a hexadecimal digit character (either case) to its numeric value.
    /// </summary>
    /// <param name="c">The character to interpret.</param>
    /// <returns>0 - 15 if the character is valid, -1 if the character is invalid.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static int ParseHexCharacter(char c)
    {
        if (c >= '0' && c <= '9')
        {
            return c - '0';
        }

        if (c >= 'A' && c <= 'F')
        {
            return (c - 'A') + 10;
        }

        if (c >= 'a' && c <= 'f')
        {
            return (c - 'a') + 10;
        }

        return -1;
    }

    /// <summary>
    /// Produces the two uppercase hex digits that represent a byte.
    /// </summary>
    /// <param name="b">The byte to encode.</param>
    /// <param name="firstHexChar">Receives the digit for the high nibble.</param>
    /// <param name="secondHexChar">Receives the digit for the low nibble.</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static void WriteHexEncodedByte(byte b, out char firstHexChar, out char secondHexChar)
    {
        uint highNibble = (uint)(b >> 4);
        uint lowNibble = (uint)(b & 0x0F);

        firstHexChar = IntToChar(highNibble);
        secondHexChar = IntToChar(lowNibble);
    }
}
|
||||
}
|
||||
|
|
@ -0,0 +1,229 @@
|
|||
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
|
||||
|
||||
using System;
|
||||
using System.Diagnostics;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Text;
|
||||
using System.Threading;
|
||||
|
||||
namespace Microsoft.AspNet.WebUtilities.Encoders
|
||||
{
|
||||
/// <summary>
/// A class which can perform HTML encoding given an allow list of characters which
/// can be represented unencoded.
/// </summary>
/// <remarks>
/// Once constructed, instances of this class are thread-safe for multiple callers.
/// </remarks>
public unsafe sealed class HtmlEncoder : IHtmlEncoder
{
    // The default HtmlEncoder (Basic Latin), instantiated on demand
    private static HtmlEncoder _defaultEncoder;

    // A bitmap of characters which are allowed to be returned unescaped.
    // One bit per UTF-16 code unit (0x10000 bits), packed into 32-bit words.
    private readonly uint[] _allowedCharsBitmap = new uint[0x10000 / 32];

    /// <summary>
    /// Instantiates an encoder using the 'Basic Latin' code table as the allow list.
    /// </summary>
    public HtmlEncoder()
        : this(CodePointFilters.BasicLatin)
    {
    }

    /// <summary>
    /// Instantiates an encoder using a custom allow list of characters.
    /// </summary>
    /// <param name="filters">
    /// The filters whose allowed code points are combined (OR) to form the allow list.
    /// A null array means no character is allowed unencoded.
    /// </param>
    public HtmlEncoder(params ICodePointFilter[] filters)
    {
        if (filters == null)
        {
            return; // no characters are allowed, just no-op immediately
        }

        // Punch a hole for each allowed code point across all filters (this is an OR).
        // We don't allow supplementary (astral) characters for now.
        foreach (var filter in filters)
        {
            foreach (var codePoint in filter.GetAllowedCodePoints())
            {
                if (!UnicodeHelpers.IsSupplementaryCodePoint(codePoint))
                {
                    AllowCharacter((char)codePoint);
                }
            }
        }

        // Forbid characters that are special in HTML
        ForbidCharacter('<');
        ForbidCharacter('>');
        ForbidCharacter('&');
        ForbidCharacter('\''); // can be used to escape attributes
        ForbidCharacter('\"'); // can be used to escape attributes
        ForbidCharacter('+'); // technically not HTML-specific, but can be used to perform UTF7-based attacks

        // Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed
        // (includes categories Cc, Cs, Co, Cn, Zl, Zp)
        uint[] definedCharactersBitmap = UnicodeHelpers.GetDefinedCharacterBitmap();
        Debug.Assert(definedCharactersBitmap.Length == _allowedCharsBitmap.Length);
        for (int i = 0; i < _allowedCharsBitmap.Length; i++)
        {
            _allowedCharsBitmap[i] &= definedCharactersBitmap[i];
        }
    }

    /// <summary>
    /// A default instance of the HtmlEncoder, equivalent to allowing only
    /// the 'Basic Latin' character range.
    /// </summary>
    public static HtmlEncoder Default
    {
        get
        {
            // No lock is taken here: the instance is created on first use and published
            // with a volatile write.
            HtmlEncoder defaultEncoder = Volatile.Read(ref _defaultEncoder);
            if (defaultEncoder == null)
            {
                defaultEncoder = new HtmlEncoder();
                Volatile.Write(ref _defaultEncoder, defaultEncoder);
            }
            return defaultEncoder;
        }
    }

    // Marks a character as allowed (can be returned unencoded)
    private void AllowCharacter(char c)
    {
        uint codePoint = (uint)c;
        int index = (int)(codePoint >> 5);      // which 32-bit word
        int offset = (int)(codePoint & 0x1FU);  // which bit within that word
        _allowedCharsBitmap[index] |= 0x1U << offset;
    }

    // Marks a character as forbidden (must be returned encoded)
    private void ForbidCharacter(char c)
    {
        uint codePoint = (uint)c;
        int index = (int)(codePoint >> 5);      // which 32-bit word
        int offset = (int)(codePoint & 0x1FU);  // which bit within that word
        _allowedCharsBitmap[index] &= ~(0x1U << offset);
    }

    /// <summary>
    /// HTML-encodes the supplied string.
    /// </summary>
    /// <param name="value">The string to encode; may be null or empty.</param>
    /// <returns>
    /// The input instance itself when it is null, empty, or contains only allowed characters;
    /// otherwise a new string in which every disallowed character has been replaced by an entity.
    /// </returns>
    public string HtmlEncode(string value)
    {
        if (String.IsNullOrEmpty(value))
        {
            return value;
        }

        // Quick check: does the string need to be encoded at all?
        // If not, just return the input string as-is.
        for (int i = 0; i < value.Length; i++)
        {
            if (!IsCharacterAllowed(value[i]))
            {
                return HtmlEncodeImpl(value, i);
            }
        }
        return value;
    }

    // Sizes the output builder, seeds it with the prefix that is already known to be safe,
    // and hands the remainder of the input to the character-by-character encoding loop.
    private string HtmlEncodeImpl(string input, int idxOfFirstCharWhichRequiresEncoding)
    {
        Debug.Assert(idxOfFirstCharWhichRequiresEncoding >= 0);
        Debug.Assert(idxOfFirstCharWhichRequiresEncoding < input.Length);

        // The worst case encoding is 8 output chars per input char: [input] U+FFFF -> [output] "&#xFFFF;"
        // We don't need to worry about astral code points since they consume *two* input chars to
        // generate at most 10 output chars ("&#x10FFFF;"), which equates to 5 output per input.
        int numCharsWhichMayRequireEncoding = input.Length - idxOfFirstCharWhichRequiresEncoding;
        int sbCapacity = checked(idxOfFirstCharWhichRequiresEncoding + EncoderCommon.GetCapacityOfOutputStringBuilder(numCharsWhichMayRequireEncoding, worstCaseOutputCharsPerInputChar: 8));
        Debug.Assert(sbCapacity >= input.Length);

        // Allocate the StringBuilder with the first (known to not require encoding) part of the input string,
        // then begin encoding from the last (potentially requiring encoding) part of the input string.
        StringBuilder builder = new StringBuilder(input, 0, idxOfFirstCharWhichRequiresEncoding, sbCapacity);
        fixed (char* pInput = input)
        {
            return HtmlEncodeImpl2(builder, &pInput[idxOfFirstCharWhichRequiresEncoding], (uint)numCharsWhichMayRequireEncoding);
        }
    }

    // Walks the remaining input one scalar value at a time, appending either the character
    // itself (when allowed) or its entity form, and returns the finished string.
    private string HtmlEncodeImpl2(StringBuilder builder, char* input, uint charsRemaining)
    {
        while (charsRemaining != 0)
        {
            int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(input, endOfString: (charsRemaining == 1));
            if (UnicodeHelpers.IsSupplementaryCodePoint(nextScalar))
            {
                // Supplementary characters should always be encoded numerically.
                WriteScalarAsHtmlEncodedEntity(builder, (uint)nextScalar);

                // We consume two UTF-16 characters for a single supplementary character.
                input += 2;
                charsRemaining -= 2;
            }
            else
            {
                // Otherwise, this was a BMP character.
                input++;
                charsRemaining--;
                char c = (char)nextScalar;
                if (IsCharacterAllowed(c))
                {
                    builder.Append(c);
                }
                else
                {
                    // Use the well-known named entities where they exist and fall back to a
                    // numeric character reference for everything else. The literals must be
                    // the entity text itself; appending the raw character would defeat the encoder.
                    if (c == '<') { builder.Append("&lt;"); }
                    else if (c == '>') { builder.Append("&gt;"); }
                    else if (c == '&') { builder.Append("&amp;"); }
                    else if (c == '\"') { builder.Append("&quot;"); }
                    else { WriteScalarAsHtmlEncodedEntity(builder, (uint)nextScalar); }
                }
            }
        }

        return builder.ToString();
    }

    // Determines whether the given character can be returned unencoded.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private bool IsCharacterAllowed(char c)
    {
        uint codePoint = (uint)c;
        int index = (int)(codePoint >> 5);
        int offset = (int)(codePoint & 0x1FU);
        return ((_allowedCharsBitmap[index] >> offset) & 0x1U) != 0;
    }

    // Writes a scalar value as "&#xFFFFFFFF;" (hexadecimal numeric character reference,
    // uppercase digits, no leading zeros).
    private static void WriteScalarAsHtmlEncodedEntity(StringBuilder builder, uint value)
    {
        // We're building the characters up in reverse
        char* chars = stackalloc char[8 /* "FFFFFFFF" */];
        int numCharsWritten = 0;
        do
        {
            Debug.Assert(numCharsWritten < 8, "Couldn't have written 8 characters out by this point.");
            // Pop off the last nibble
            chars[numCharsWritten++] = HexUtil.IntToChar(value & 0xFU);
            value >>= 4;
        } while (value != 0);

        // Finally, write out the HTML-encoded scalar value.
        builder.Append('&');
        builder.Append('#');
        builder.Append('x');
        Debug.Assert(numCharsWritten > 0, "At least one character should've been written.");
        do
        {
            builder.Append(chars[--numCharsWritten]);
        } while (numCharsWritten != 0);
        builder.Append(';');
    }
}
|
||||
}
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace Microsoft.AspNet.WebUtilities.Encoders
|
||||
{
|
||||
/// <summary>
/// A filter that lets only a chosen set of Unicode code points pass.
/// </summary>
public interface ICodePointFilter
{
    /// <summary>
    /// Enumerates every code point this filter allows.
    /// </summary>
    /// <returns>A sequence containing each allowed code point as an integer value.</returns>
    IEnumerable<int> GetAllowedCodePoints();
}
|
||||
}
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
|
||||
|
||||
using System;
|
||||
|
||||
namespace Microsoft.AspNet.WebUtilities.Encoders
|
||||
{
|
||||
/// <summary>
/// Abstraction over a service that HTML-encodes text.
/// </summary>
public interface IHtmlEncoder
{
    /// <summary>
    /// Produces the HTML-encoded form of a string.
    /// </summary>
    /// <param name="value">The string to encode.</param>
    /// <returns>
    /// The HTML-encoded value, or null if the input string was null.
    /// </returns>
    /// <remarks>
    /// The result may also be placed inside an HTML attribute, provided the
    /// attribute value is wrapped in single or double quotes.
    /// </remarks>
    string HtmlEncode(string value);
}
|
||||
}
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
|
||||
|
||||
using System;
|
||||
|
||||
namespace Microsoft.AspNet.WebUtilities.Encoders
|
||||
{
|
||||
/// <summary>
/// Abstraction over a service that escapes text for use in JavaScript strings.
/// </summary>
public interface IJavaScriptStringEncoder
{
    /// <summary>
    /// Produces the JavaScript-escaped form of a string.
    /// </summary>
    /// <param name="value">The string to escape.</param>
    /// <returns>
    /// The JavaScript-escaped value, or null if the input string was null.
    /// </returns>
    string JavaScriptStringEncode(string value);
}
|
||||
}
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
|
||||
|
||||
using System;
|
||||
|
||||
namespace Microsoft.AspNet.WebUtilities.Encoders
|
||||
{
|
||||
/// <summary>
/// Abstraction over a service that escapes text for use in URLs.
/// </summary>
public interface IUrlEncoder
{
    /// <summary>
    /// Produces the URL-escaped form of a string.
    /// </summary>
    /// <param name="value">The string to escape.</param>
    /// <returns>
    /// The URL-escaped value, or null if the input string was null.
    /// </returns>
    /// <remarks>
    /// The result is safe to place in the segment, query, or fragment
    /// portion of a URI.
    /// </remarks>
    string UrlEncode(string value);
}
|
||||
}
|
||||
|
|
@ -0,0 +1,163 @@
|
|||
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
|
||||
|
||||
using System;
|
||||
using System.Diagnostics;
|
||||
using System.Text;
|
||||
using System.Threading;
|
||||
|
||||
namespace Microsoft.AspNet.WebUtilities.Encoders
|
||||
{
|
||||
/// <summary>
/// Escapes text for use in a JavaScript string, given an allow list of characters
/// which may appear unescaped.
/// </summary>
/// <remarks>
/// Once constructed, instances of this class are thread-safe for multiple callers.
/// </remarks>
public sealed class JavaScriptStringEncoder : IJavaScriptStringEncoder
{
    // Backing field for Default (Basic Latin allow list); populated on first access.
    private static JavaScriptStringEncoder _defaultEncoder;

    // The encoder that does the real work; this public type only forwards to it.
    private readonly JavaScriptStringUnicodeEncoder _innerUnicodeEncoder;

    /// <summary>
    /// Creates an encoder whose allow list is the 'Basic Latin' code table.
    /// </summary>
    public JavaScriptStringEncoder()
        : this(JavaScriptStringUnicodeEncoder.BasicLatin)
    {
    }

    /// <summary>
    /// Creates an encoder whose allow list is built from the supplied filters.
    /// </summary>
    /// <param name="filters">The filters describing which code points may pass unescaped.</param>
    public JavaScriptStringEncoder(params ICodePointFilter[] filters)
        : this(new JavaScriptStringUnicodeEncoder(filters))
    {
    }

    // Shared constructor target: wraps an already-built inner encoder.
    private JavaScriptStringEncoder(JavaScriptStringUnicodeEncoder innerEncoder)
    {
        Debug.Assert(innerEncoder != null);
        _innerUnicodeEncoder = innerEncoder;
    }

    /// <summary>
    /// A default instance of the JavaScriptStringEncoder, equivalent to allowing only
    /// the 'Basic Latin' character range.
    /// </summary>
    public static JavaScriptStringEncoder Default
    {
        get
        {
            JavaScriptStringEncoder cached = Volatile.Read(ref _defaultEncoder);
            if (cached != null)
            {
                return cached;
            }

            // First use: build the instance and publish it for later callers.
            JavaScriptStringEncoder created = new JavaScriptStringEncoder();
            Volatile.Write(ref _defaultEncoder, created);
            return created;
        }
    }

    /// <summary>
    /// JavaScript-escapes the supplied string by delegating to the inner encoder.
    /// </summary>
    /// <param name="value">The string to escape.</param>
    /// <returns>Whatever the inner encoder's Encode method returns for <paramref name="value"/>.</returns>
    public string JavaScriptStringEncode(string value)
    {
        string escaped = _innerUnicodeEncoder.Encode(value);
        return escaped;
    }

    // The concrete UnicodeEncoderBase that knows how to write JavaScript escape sequences.
    private sealed class JavaScriptStringUnicodeEncoder : UnicodeEncoderBase
    {
        // Lazily created instance shared by every encoder that uses the Basic Latin allow list.
        private static JavaScriptStringUnicodeEncoder _basicLatinSingleton;

        // The worst case encoding is 6 output chars per input char: [input] U+FFFF -> [output] "\uFFFF"
        // Astral code points need no special accounting because they are emitted as an
        // escaped surrogate pair, i.e. two input chars become two escaped chars.
        private const int MaxOutputCharsPerInputChar = 6;

        internal JavaScriptStringUnicodeEncoder(ICodePointFilter[] filters)
            : base(filters, MaxOutputCharsPerInputChar)
        {
            // Beyond what the base encoder already forbids, only the reverse solidus
            // and the solidus need to be escaped.
            ForbidCharacter('\\');
            ForbidCharacter('/');
        }

        // Gets the shared Basic Latin encoder, creating it on first access.
        internal static JavaScriptStringUnicodeEncoder BasicLatin
        {
            get
            {
                JavaScriptStringUnicodeEncoder cached = Volatile.Read(ref _basicLatinSingleton);
                if (cached != null)
                {
                    return cached;
                }

                JavaScriptStringUnicodeEncoder created = new JavaScriptStringUnicodeEncoder(new[] { CodePointFilters.BasicLatin });
                Volatile.Write(ref _basicLatinSingleton, created);
                return created;
            }
        }

        // Writes a scalar value as a JavaScript-escaped character (or sequence of characters).
        // See ECMA-262, Sec. 7.8.4, and ECMA-404, Sec. 9
        // http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.4
        // http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf
        protected override void WriteEncodedScalar(StringBuilder builder, uint value)
        {
            // ECMA-262 allows encoding U+000B as "\v", but ECMA-404 does not, so it gets no
            // short form here. Both specs allow U+002F SOLIDUS to be written as "\/"
            // (in ECMA-262 this character is a NonEscape character).
            // HTML-specific characters (including apostrophe and quotes) have no short
            // form either and are written numerically for defense-in-depth.
            // See UnicodeEncoderBase ctor comments for more info.
            switch (value)
            {
                case '\b':
                    builder.Append(@"\b");
                    break;
                case '\t':
                    builder.Append(@"\t");
                    break;
                case '\n':
                    builder.Append(@"\n");
                    break;
                case '\f':
                    builder.Append(@"\f");
                    break;
                case '\r':
                    builder.Append(@"\r");
                    break;
                case '/':
                    builder.Append(@"\/");
                    break;
                case '\\':
                    builder.Append(@"\\");
                    break;
                default:
                    WriteEncodedScalarAsNumericEntity(builder, value);
                    break;
            }
        }

        // Writes a scalar value in numeric "\uXXXX" form, using two escapes (a surrogate
        // pair) when the value lies outside the BMP.
        private static void WriteEncodedScalarAsNumericEntity(StringBuilder builder, uint value)
        {
            if (!UnicodeHelpers.IsSupplementaryCodePoint((int)value))
            {
                // A BMP value needs just one escape.
                WriteEncodedSingleCharacter(builder, value);
                return;
            }

            // Convert back to UTF-16 and escape each half of the surrogate pair.
            char highHalf, lowHalf;
            UnicodeHelpers.GetUtf16SurrogatePairFromAstralScalarValue((int)value, out highHalf, out lowHalf);
            WriteEncodedSingleCharacter(builder, highHalf);
            WriteEncodedSingleCharacter(builder, lowHalf);
        }

        // Writes an encoded scalar value (in the BMP) as a JavaScript-escaped character.
        private static void WriteEncodedSingleCharacter(StringBuilder builder, uint value)
        {
            Debug.Assert(!UnicodeHelpers.IsSupplementaryCodePoint((int)value), "The incoming value should've been in the BMP.");

            // Encode this as 6 chars "\uFFFF": the prefix, then four hex digits from the
            // most significant nibble down to the least significant.
            builder.Append('\\');
            builder.Append('u');
            builder.Append(HexUtil.IntToChar(value >> 12));
            for (int shift = 8; shift >= 0; shift -= 4)
            {
                builder.Append(HexUtil.IntToChar((value >> shift) & 0xFU));
            }
        }
    }
}
|
||||
}
|
||||
|
|
@ -0,0 +1,171 @@
|
|||
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
|
||||
|
||||
using System;
|
||||
using System.Diagnostics;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Text;
|
||||
|
||||
namespace Microsoft.AspNet.WebUtilities.Encoders
|
||||
{
|
||||
/// <summary>
/// Shared implementation for encoders that pass allow-listed BMP characters through
/// unchanged and delegate the escaping of everything else to a derived class.
/// </summary>
internal unsafe abstract class UnicodeEncoderBase
{
    // A bitmap of characters which are allowed to be returned unescaped.
    // One bit per UTF-16 code unit (0x10000 bits), packed into 32-bit words.
    private readonly uint[] _allowedCharsBitmap = new uint[0x10000 / 32];

    // The worst-case number of output chars generated for any input char.
    private readonly int _maxOutputCharsPerInputChar;

    /// <summary>
    /// Instantiates an encoder using a custom allow list of characters.
    /// </summary>
    /// <param name="filters">
    /// The filters whose allowed code points are combined (OR) to form the allow list.
    /// A null array means no filter contributes any character.
    /// </param>
    /// <param name="maxOutputCharsPerInputChar">
    /// The largest number of output characters the derived encoder can produce for a
    /// single input character; used to pre-size the output buffer.
    /// </param>
    protected UnicodeEncoderBase(ICodePointFilter[] filters, int maxOutputCharsPerInputChar)
    {
        _maxOutputCharsPerInputChar = maxOutputCharsPerInputChar;

        if (filters != null)
        {
            // Punch a hole for each allowed code point across all filters (this is an OR).
            // We don't allow supplementary (astral) characters for now.
            foreach (var filter in filters)
            {
                foreach (var codePoint in filter.GetAllowedCodePoints())
                {
                    if (!UnicodeHelpers.IsSupplementaryCodePoint(codePoint))
                    {
                        AllowCharacter((char)codePoint);
                    }
                }
            }
        }

        // Forbid characters that are special in HTML.
        // Even though this is a common encoder used by everybody (including URL
        // and JavaScript strings), it's unfortunately common for developers to
        // forget to HTML-encode a string once it has been URL-encoded or
        // JavaScript string-escaped, so this offers extra protection.
        ForbidCharacter('<');
        ForbidCharacter('>');
        ForbidCharacter('&');
        ForbidCharacter('\''); // can be used to escape attributes
        ForbidCharacter('\"'); // can be used to escape attributes
        ForbidCharacter('+'); // technically not HTML-specific, but can be used to perform UTF7-based attacks

        // Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed
        // (includes categories Cc, Cs, Co, Cn, Zl, Zp)
        uint[] definedCharactersBitmap = UnicodeHelpers.GetDefinedCharacterBitmap();
        Debug.Assert(definedCharactersBitmap.Length == _allowedCharsBitmap.Length);
        for (int i = 0; i < _allowedCharsBitmap.Length; i++)
        {
            _allowedCharsBitmap[i] &= definedCharactersBitmap[i];
        }
    }

    // Marks a character as allowed (can be returned unencoded)
    private void AllowCharacter(char c)
    {
        uint codePoint = (uint)c;
        int index = (int)(codePoint >> 5);      // which 32-bit word
        int offset = (int)(codePoint & 0x1FU);  // which bit within that word
        _allowedCharsBitmap[index] |= 0x1U << offset;
    }

    // Marks a character as forbidden (must be returned encoded)
    protected void ForbidCharacter(char c)
    {
        uint codePoint = (uint)c;
        int index = (int)(codePoint >> 5);      // which 32-bit word
        int offset = (int)(codePoint & 0x1FU);  // which bit within that word
        _allowedCharsBitmap[index] &= ~(0x1U << offset);
    }

    /// <summary>
    /// Entry point to the encoder.
    /// </summary>
    /// <param name="value">The string to encode; may be null or empty.</param>
    /// <returns>
    /// The input instance itself when it is null, empty, or contains only allowed characters;
    /// otherwise a new string in which every disallowed character has been escaped.
    /// </returns>
    public string Encode(string value)
    {
        if (String.IsNullOrEmpty(value))
        {
            return value;
        }

        // Quick check: does the string need to be encoded at all?
        // If not, just return the input string as-is.
        for (int i = 0; i < value.Length; i++)
        {
            if (!IsCharacterAllowed(value[i]))
            {
                return EncodeCore(value, i);
            }
        }
        return value;
    }

    // Sizes the output builder, seeds it with the prefix that is already known to be safe,
    // and hands the remainder of the input to the character-by-character encoding loop.
    private string EncodeCore(string input, int idxOfFirstCharWhichRequiresEncoding)
    {
        Debug.Assert(idxOfFirstCharWhichRequiresEncoding >= 0);
        Debug.Assert(idxOfFirstCharWhichRequiresEncoding < input.Length);

        // The worst-case expansion per input char depends on the output format, so it comes
        // from the derived encoder (via the constructor) rather than being fixed here.
        int numCharsWhichMayRequireEncoding = input.Length - idxOfFirstCharWhichRequiresEncoding;
        int sbCapacity = checked(idxOfFirstCharWhichRequiresEncoding + EncoderCommon.GetCapacityOfOutputStringBuilder(numCharsWhichMayRequireEncoding, _maxOutputCharsPerInputChar));
        Debug.Assert(sbCapacity >= input.Length);

        // Allocate the StringBuilder with the first (known to not require encoding) part of the input string,
        // then begin encoding from the last (potentially requiring encoding) part of the input string.
        StringBuilder builder = new StringBuilder(input, 0, idxOfFirstCharWhichRequiresEncoding, sbCapacity);
        fixed (char* pInput = input)
        {
            return EncodeCore2(builder, &pInput[idxOfFirstCharWhichRequiresEncoding], (uint)numCharsWhichMayRequireEncoding);
        }
    }

    // Walks the remaining input one scalar value at a time, appending either the character
    // itself (when allowed) or whatever the derived class writes for it.
    private string EncodeCore2(StringBuilder builder, char* input, uint charsRemaining)
    {
        while (charsRemaining != 0)
        {
            int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(input, endOfString: (charsRemaining == 1));
            if (UnicodeHelpers.IsSupplementaryCodePoint(nextScalar))
            {
                // Supplementary characters should always be encoded numerically.
                WriteEncodedScalar(builder, (uint)nextScalar);

                // We consume two UTF-16 characters for a single supplementary character.
                input += 2;
                charsRemaining -= 2;
            }
            else
            {
                // Otherwise, this was a BMP character.
                input++;
                charsRemaining--;
                char c = (char)nextScalar;
                if (IsCharacterAllowed(c))
                {
                    builder.Append(c);
                }
                else
                {
                    WriteEncodedScalar(builder, (uint)nextScalar);
                }
            }
        }

        return builder.ToString();
    }

    // Determines whether the given character can be returned unencoded.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private bool IsCharacterAllowed(char c)
    {
        uint codePoint = (uint)c;
        int index = (int)(codePoint >> 5);
        int offset = (int)(codePoint & 0x1FU);
        return ((_allowedCharsBitmap[index] >> offset) & 0x1U) != 0;
    }

    /// <summary>
    /// Appends the escaped form of a scalar value to the output. Called for every
    /// supplementary code point and for every BMP character that is not on the allow list.
    /// </summary>
    /// <param name="builder">The builder receiving the output.</param>
    /// <param name="value">The scalar value to escape.</param>
    protected abstract void WriteEncodedScalar(StringBuilder builder, uint value);
}
|
||||
}
|
||||
|
|
@ -0,0 +1,228 @@
|
|||
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
|
||||
|
||||
using System;
|
||||
using System.Diagnostics;
|
||||
using System.Reflection;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Threading;
|
||||
|
||||
namespace Microsoft.AspNet.WebUtilities.Encoders
|
||||
{
|
||||
/// <summary>
|
||||
/// Contains helpers for dealing with Unicode code points.
|
||||
/// </summary>
|
||||
internal unsafe static class UnicodeHelpers
|
||||
{
|
||||
/// <summary>
|
||||
/// Used for invalid Unicode sequences or other unrepresentable values.
|
||||
/// </summary>
|
||||
private const char UNICODE_REPLACEMENT_CHAR = '\uFFFD';
|
||||
|
||||
/// <summary>
|
||||
/// The last code point defined by the Unicode specification.
|
||||
/// </summary>
|
||||
internal const int UNICODE_LAST_CODEPOINT = 0x10FFFF;
|
||||
|
||||
private static uint[] _definedCharacterBitmap;
|
||||
|
||||
/// <summary>
/// Helper method which creates a bitmap of all characters which are
/// defined per version 7.0.0 of the Unicode specification.
/// </summary>
/// <returns>
/// A 2048-element array read from the embedded resource; the same array is also
/// stored in the static cache field before returning.
/// </returns>
/// <remarks>
/// The process is terminated via <see cref="Environment.FailFast(string)"/> if the
/// embedded resource is missing, has an unexpected size, or ends prematurely.
/// </remarks>
[MethodImpl(MethodImplOptions.NoInlining)]
private static uint[] CreateDefinedCharacterBitmap()
{
    byte[] rawData = new byte[8 * 1024];

    // The stream is disposed as soon as its contents have been copied into rawData.
    using (var stream = typeof(UnicodeHelpers).GetTypeInfo().Assembly.GetManifestResourceStream("compiler/resources/unicode-7.0.0-defined-characters.bin"))
    {
        // GetManifestResourceStream yields null when the resource cannot be found;
        // treat that the same way as any other unusable data.
        // The stream should be exactly 8KB in size.
        if (stream == null || stream.Length != 8 * 1024)
        {
            Environment.FailFast("Corrupt data detected.");
        }

        // Read everything in as raw bytes. Read may return fewer bytes than requested,
        // so keep going until the buffer is full.
        for (int numBytesReadTotal = 0; numBytesReadTotal < rawData.Length;)
        {
            int numBytesReadThisIteration = stream.Read(rawData, numBytesReadTotal, rawData.Length - numBytesReadTotal);
            if (numBytesReadThisIteration == 0)
            {
                Environment.FailFast("Corrupt data detected.");
            }
            numBytesReadTotal += numBytesReadThisIteration;
        }
    }

    // Finally, convert the byte[] to a uint[].
    // The incoming bytes are little-endian.
    uint[] retVal = new uint[2 * 1024];
    for (int i = 0; i < retVal.Length; i++)
    {
        retVal[i] = (((uint)rawData[4 * i + 3]) << 24)
            | (((uint)rawData[4 * i + 2]) << 16)
            | (((uint)rawData[4 * i + 1]) << 8)
            | (uint)rawData[4 * i];
    }

    // And we're done!
    Volatile.Write(ref _definedCharacterBitmap, retVal);
    return retVal;
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a bitmap of all characters which are defined per version 7.0.0
|
||||
/// of the Unicode specification.
|
||||
/// </summary>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
internal static uint[] GetDefinedCharacterBitmap()
|
||||
{
|
||||
return Volatile.Read(ref _definedCharacterBitmap) ?? CreateDefinedCharacterBitmap();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Given a UTF-16 character stream, reads the next scalar value from the stream.
|
||||
/// Set 'endOfString' to true if 'pChar' points to the last character in the stream.
|
||||
/// </summary>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
internal static int GetScalarValueFromUtf16(char* pChar, bool endOfString)
|
||||
{
|
||||
// This method is marked as AggressiveInlining to handle the common case of a non-surrogate
|
||||
// character. The surrogate case is handled in the slower fallback code path.
|
||||
char thisChar = *pChar;
|
||||
return (Char.IsSurrogate(thisChar)) ? GetScalarValueFromUtf16Slow(pChar, endOfString) : thisChar;
|
||||
}
|
||||
|
||||
private static int GetScalarValueFromUtf16Slow(char* pChar, bool endOfString)
|
||||
{
|
||||
char firstChar = pChar[0];
|
||||
|
||||
if (!Char.IsSurrogate(firstChar))
|
||||
{
|
||||
Debug.Fail("This case should've been handled by the fast path.");
|
||||
return firstChar;
|
||||
}
|
||||
else if (Char.IsHighSurrogate(firstChar))
|
||||
{
|
||||
if (endOfString)
|
||||
{
|
||||
// unmatched surrogate - substitute
|
||||
return UNICODE_REPLACEMENT_CHAR;
|
||||
}
|
||||
else
|
||||
{
|
||||
char secondChar = pChar[1];
|
||||
if (Char.IsLowSurrogate(secondChar))
|
||||
{
|
||||
// valid surrogate pair - extract codepoint
|
||||
return GetScalarValueFromUtf16SurrogatePair(firstChar, secondChar);
|
||||
}
|
||||
else
|
||||
{
|
||||
// unmatched surrogate - substitute
|
||||
return UNICODE_REPLACEMENT_CHAR;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// unmatched surrogate - substitute
|
||||
Debug.Assert(Char.IsLowSurrogate(firstChar));
|
||||
return UNICODE_REPLACEMENT_CHAR;
|
||||
}
|
||||
}
|
||||
|
||||
private static int GetScalarValueFromUtf16SurrogatePair(char highSurrogate, char lowSurrogate)
|
||||
{
|
||||
Debug.Assert(Char.IsHighSurrogate(highSurrogate));
|
||||
Debug.Assert(Char.IsLowSurrogate(lowSurrogate));
|
||||
|
||||
// See http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf, Table 3.5 for the
|
||||
// details of this conversion. We don't use Char.ConvertToUtf32 because its exception
|
||||
// handling shows up on the hot path, and our caller has already sanitized the inputs.
|
||||
return (lowSurrogate & 0x3ff) | (((highSurrogate & 0x3ff) + (1 << 6)) << 10);
|
||||
}
|
||||
|
||||
internal static void GetUtf16SurrogatePairFromAstralScalarValue(int scalar, out char highSurrogate, out char lowSurrogate)
|
||||
{
|
||||
Debug.Assert(0x10000 <= scalar && scalar <= UNICODE_LAST_CODEPOINT);
|
||||
|
||||
// See http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf, Table 3.5 for the
|
||||
// details of this conversion. We don't use Char.ConvertFromUtf32 because its exception
|
||||
// handling shows up on the hot path, it allocates temporary strings (which we don't want),
|
||||
// and our caller has already sanitized the inputs.
|
||||
|
||||
int x = scalar & 0xFFFF;
|
||||
int u = scalar >> 16;
|
||||
int w = u - 1;
|
||||
highSurrogate = (char)(0xD800 | (w << 6) | (x >> 10));
|
||||
lowSurrogate = (char)(0xDC00 | (x & 0x3FF));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Given a Unicode scalar value, returns the UTF-8 representation of the value.
|
||||
/// The return value's bytes should be popped from the LSB.
|
||||
/// </summary>
|
||||
internal static int GetUtf8RepresentationForScalarValue(uint scalar)
|
||||
{
|
||||
Debug.Assert(scalar <= UNICODE_LAST_CODEPOINT);
|
||||
|
||||
// See http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf, Table 3.6 for the
|
||||
// details of this conversion. We don't use UTF8Encoding since we're encoding
|
||||
// a scalar code point, not a UTF16 character sequence.
|
||||
if (scalar <= 0x7f)
|
||||
{
|
||||
// one byte used: scalar 00000000 0xxxxxxx -> byte sequence 0xxxxxxx
|
||||
byte firstByte = (byte)scalar;
|
||||
return firstByte;
|
||||
}
|
||||
else if (scalar <= 0x7ff)
|
||||
{
|
||||
// two bytes used: scalar 00000yyy yyxxxxxx -> byte sequence 110yyyyy 10xxxxxx
|
||||
byte firstByte = (byte)(0xc0 | (scalar >> 6));
|
||||
byte secondByteByte = (byte)(0x80 | (scalar & 0x3f));
|
||||
return ((secondByteByte << 8) | firstByte);
|
||||
}
|
||||
else if (scalar <= 0xffff)
|
||||
{
|
||||
// three bytes used: scalar zzzzyyyy yyxxxxxx -> byte sequence 1110zzzz 10yyyyyy 10xxxxxx
|
||||
byte firstByte = (byte)(0xe0 | (scalar >> 12));
|
||||
byte secondByte = (byte)(0x80 | ((scalar >> 6) & 0x3f));
|
||||
byte thirdByte = (byte)(0x80 | (scalar & 0x3f));
|
||||
return ((((thirdByte << 8) | secondByte) << 8) | firstByte);
|
||||
}
|
||||
else
|
||||
{
|
||||
// four bytes used: scalar 000uuuuu zzzzyyyy yyxxxxxx -> byte sequence 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
|
||||
byte firstByte = (byte)(0xf0 | (scalar >> 18));
|
||||
byte secondByte = (byte)(0x80 | ((scalar >> 12) & 0x3f));
|
||||
byte thirdByte = (byte)(0x80 | ((scalar >> 6) & 0x3f));
|
||||
byte fourthByte = (byte)(0x80 | (scalar & 0x3f));
|
||||
return ((((((fourthByte << 8) | thirdByte) << 8) | secondByte) << 8) | firstByte);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a value stating whether a character is defined per version 7.0.0
|
||||
/// of the Unicode specification. Certain classes of characters (control chars,
|
||||
/// private use, surrogates, some whitespace) are considered "undefined" for
|
||||
/// our purposes.
|
||||
/// </summary>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
internal static bool IsCharacterDefined(char c)
|
||||
{
|
||||
uint codePoint = (uint)c;
|
||||
int index = (int)(codePoint >> 5);
|
||||
int offset = (int)(codePoint & 0x1FU);
|
||||
return ((GetDefinedCharacterBitmap()[index] >> offset) & 0x1U) != 0;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Determines whether the given scalar value is in the supplementary plane and thus
|
||||
/// requires 2 characters to be represented in UTF-16 (as a surrogate pair).
|
||||
/// </summary>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
internal static bool IsSupplementaryCodePoint(int scalar)
|
||||
{
|
||||
return ((scalar & ~((int)Char.MaxValue)) != 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,161 @@
|
|||
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
|
||||
|
||||
using System;
|
||||
using System.Diagnostics;
|
||||
using System.Text;
|
||||
using System.Threading;
|
||||
|
||||
namespace Microsoft.AspNet.WebUtilities.Encoders
|
||||
{
|
||||
/// <summary>
/// Performs URL string escaping, leaving unescaped only those characters which
/// appear on the encoder's allow list.
/// </summary>
/// <remarks>
/// Once constructed, instances of this class are thread-safe for multiple callers.
/// </remarks>
public sealed class UrlEncoder : IUrlEncoder
{
    // Shared instance handed out by the Default property; created on first request.
    private static UrlEncoder _defaultEncoder;

    // The encoder which does the real work on behalf of this wrapper.
    private readonly UrlUnicodeEncoder _innerUnicodeEncoder;

    /// <summary>
    /// Creates an encoder whose allow list is the 'Basic Latin' code table.
    /// </summary>
    public UrlEncoder()
        : this(UrlUnicodeEncoder.BasicLatin)
    {
    }

    /// <summary>
    /// Creates an encoder whose allow list is built from the supplied filters.
    /// </summary>
    /// <param name="filters">The code point filters describing the allowed characters.</param>
    public UrlEncoder(params ICodePointFilter[] filters)
        : this(new UrlUnicodeEncoder(filters))
    {
    }

    // Common constructor: wraps an already-built inner encoder.
    private UrlEncoder(UrlUnicodeEncoder innerEncoder)
    {
        Debug.Assert(innerEncoder != null);
        _innerUnicodeEncoder = innerEncoder;
    }

    /// <summary>
    /// A shared UrlEncoder which allows only the 'Basic Latin' character range,
    /// created the first time it is requested.
    /// </summary>
    public static UrlEncoder Default
    {
        get
        {
            UrlEncoder cached = Volatile.Read(ref _defaultEncoder);
            if (cached != null)
            {
                return cached;
            }

            // Nothing published yet: build an instance and publish it for later callers.
            UrlEncoder created = new UrlEncoder();
            Volatile.Write(ref _defaultEncoder, created);
            return created;
        }
    }

    /// <summary>
    /// URL-escapes the supplied string by handing it to the inner encoder.
    /// </summary>
    /// <param name="value">The string to escape.</param>
    /// <returns>Whatever the inner encoder's Encode method returns for 'value'.</returns>
    public string UrlEncode(string value)
    {
        string encoded = _innerUnicodeEncoder.Encode(value);
        return encoded;
    }

    private sealed class UrlUnicodeEncoder : UnicodeEncoderBase
    {
        // Shared Basic Latin encoder, created on first request.
        private static UrlUnicodeEncoder _basicLatinSingleton;

        // Input is converted to UTF8 before escaping, so the worst case is
        // 9 output chars per input char: [input] U+FFFF -> [output] "%XX%YY%ZZ".
        // Astral code points need no special allowance: they consume 2 input chars
        // to produce 12 output chars "%XX%YY%ZZ%WW", i.e. 6 output chars per input char.
        private const int MaxOutputCharsPerInputChar = 9;

        internal UrlUnicodeEncoder(ICodePointFilter[] filters)
            : base(filters, MaxOutputCharsPerInputChar)
        {
            // Per RFC 3987, Sec. 2.2, we want encodings that are safe for
            // 'isegment', 'iquery', and 'ifragment'. The only production shared
            // by all three is 'ipchar', defined as follows:
            //
            //    ipchar         = iunreserved / pct-encoded / sub-delims / ":"
            //                   / "@"
            //
            //    iunreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
            //
            //    ucschar        = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
            //                   / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
            //                   / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
            //                   / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
            //                   / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
            //                   / %xD0000-DFFFD / %xE1000-EFFFD
            //
            //    pct-encoded    = "%" HEXDIG HEXDIG
            //
            //    sub-delims     = "!" / "$" / "&" / "'" / "(" / ")"
            //                   / "*" / "+" / "," / ";" / "="
            //
            // Of these, the base encoder already blocks "&", "'", "+", and
            // "=" is blocked here as well because of its special meaning in
            // x-www-form-urlencoded representations.
            //
            // The complete set of Basic Latin characters left unescaped is therefore:
            // ALPHA / DIGIT / "-" / "." / "_" / "~" / "!" / "$" / "(" / ")" / "*" / "," / ";" / ":" / "@"

            // Basic Latin chars which the base encoder does not already disallow.
            const string forbiddenChars = @" #%/=?[\]^`{|}";
            for (int idx = 0; idx < forbiddenChars.Length; idx++)
            {
                ForbidCharacter(forbiddenChars[idx]);
            }

            // Specials (U+FFF0 .. U+FFFF) fall outside 'ucschar' as defined above.
            for (int codeUnit = 0xFFF0; codeUnit <= 0xFFFF; codeUnit++)
            {
                ForbidCharacter((char)codeUnit);
            }

            // Supplementary characters are forbidden anyway by the base encoder
        }

        // The shared encoder restricted to the Basic Latin filter.
        internal static UrlUnicodeEncoder BasicLatin
        {
            get
            {
                UrlUnicodeEncoder cached = Volatile.Read(ref _basicLatinSingleton);
                if (cached != null)
                {
                    return cached;
                }

                // Nothing published yet: build an instance and publish it for later callers.
                UrlUnicodeEncoder created = new UrlUnicodeEncoder(new[] { CodePointFilters.BasicLatin });
                Volatile.Write(ref _basicLatinSingleton, created);
                return created;
            }
        }

        // Writes a scalar value as a percent-encoded sequence of UTF8 bytes, per RFC 3987.
        protected override void WriteEncodedScalar(StringBuilder builder, uint value)
        {
            // The helper packs the UTF8 bytes into one integer with the first byte in the
            // least significant position, so emit the low byte and shift until nothing is left.
            uint remainingBytes = (uint)UnicodeHelpers.GetUtf8RepresentationForScalarValue(value);
            while (true)
            {
                char highNibble, lowNibble;
                HexUtil.WriteHexEncodedByte((byte)remainingBytes, out highNibble, out lowNibble);
                builder.Append('%').Append(highNibble).Append(lowNibble);

                remainingBytes >>= 8;
                if (remainingBytes == 0)
                {
                    break;
                }
            }
        }
    }
}
|
||||
}
|
||||
Binary file not shown.
|
|
@ -1,6 +1,9 @@
|
|||
{
|
||||
"version": "1.0.0-*",
|
||||
"description": "ASP.NET 5 common helper methods such as URL encoding.",
|
||||
"compilationOptions": {
|
||||
"allowUnsafe": true
|
||||
},
|
||||
"dependencies": {
|
||||
},
|
||||
"frameworks": {
|
||||
|
|
@ -11,8 +14,11 @@
|
|||
"System.Diagnostics.Debug": "4.0.10-beta-*",
|
||||
"System.IO": "4.0.10-beta-*",
|
||||
"System.IO.FileSystem": "4.0.0-beta-*",
|
||||
"System.Linq": "4.0.0-beta-*",
|
||||
"System.Reflection.TypeExtensions": "4.0.0-beta-*",
|
||||
"System.Runtime": "4.0.20-beta-*",
|
||||
"System.Runtime.Extensions": "4.0.10-beta-*"
|
||||
"System.Runtime.Extensions": "4.0.10-beta-*",
|
||||
"System.Threading": "4.0.10-beta-*"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue