Add HtmlEncoder, UrlEncoder, and JavaScriptStringEncoder

Also add interfaces for abstracting each of these
Unit tests are not in yet but are coming soon
This commit is contained in:
Levi B 2015-02-07 16:03:40 -08:00
parent dadd9cd9f3
commit 1008e17259
14 changed files with 3719 additions and 1 deletions

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,37 @@
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
namespace Microsoft.AspNet.WebUtilities.Encoders
{
internal static class EncoderCommon
{
    // Soft ceiling, in chars, for any StringBuilder we preallocate. 16k chars is 32KB of
    // character data; staying at or below it is intended to keep the allocation off the LOH.
    private const int SoftMaxBuilderChars = 16 * 1024;

    // Picks the initial capacity for the StringBuilder that will receive encoder output.
    //   numCharsToEncode                 - number of input chars that may need encoding
    //   worstCaseOutputCharsPerInputChar - largest number of output chars one input char can produce
    // The result is only a starting size: StringBuilder allocates additional blocks on
    // its own if the output turns out to be longer.
    public static int GetCapacityOfOutputStringBuilder(int numCharsToEncode, int worstCaseOutputCharsPerInputChar)
    {
        if (numCharsToEncode < SoftMaxBuilderChars)
        {
            // Reserve the worst case when it fits under the soft ceiling, otherwise stop at
            // the ceiling. The multiplication is done in 64 bits so it cannot overflow.
            long worstCaseTotalChars = (long)numCharsToEncode * worstCaseOutputCharsPerInputChar;
            return (worstCaseTotalChars < SoftMaxBuilderChars) ? (int)worstCaseTotalChars : SoftMaxBuilderChars;
        }

        // The input alone already reaches the ceiling. The output is never shorter than the
        // input, so reserve exactly the input length and hope for the best-case outcome.
        return numCharsToEncode;
    }
}
}

View File

@ -0,0 +1,48 @@
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
namespace Microsoft.AspNet.WebUtilities.Encoders
{
/// <summary>
/// Helper routines for converting between numeric values and hexadecimal characters.
/// </summary>
internal static class HexUtil
{
    /// <summary>
    /// Maps a value 0 - 15 to the matching uppercase hexadecimal digit ('0' - '9', 'A' - 'F').
    /// </summary>
    /// <param name="i">The nibble to convert; values of 16 or more trip a debug assertion.</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static char IntToChar(uint i)
    {
        Debug.Assert(i < 16);
        if (i < 10)
        {
            return (char)('0' + i);
        }
        return (char)('A' + (i - 10));
    }

    /// <summary>
    /// Converts a hexadecimal digit (either case) to its numeric value.
    /// </summary>
    /// <returns>0 - 15 for a valid hexadecimal digit, otherwise -1.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static int ParseHexCharacter(char c)
    {
        if (c >= '0' && c <= '9')
        {
            return c - '0';
        }
        if (c >= 'A' && c <= 'F')
        {
            return (c - 'A') + 10;
        }
        if (c >= 'a' && c <= 'f')
        {
            return (c - 'a') + 10;
        }
        return -1;
    }

    /// <summary>
    /// Splits a byte into its two uppercase hexadecimal digits.
    /// </summary>
    /// <param name="b">The byte to encode.</param>
    /// <param name="firstHexChar">Receives the digit for the high nibble.</param>
    /// <param name="secondHexChar">Receives the digit for the low nibble.</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static void WriteHexEncodedByte(byte b, out char firstHexChar, out char secondHexChar)
    {
        uint highNibble = (uint)b >> 4;
        uint lowNibble = (uint)b & 0xFU;
        firstHexChar = IntToChar(highNibble);
        secondHexChar = IntToChar(lowNibble);
    }
}
}

View File

@ -0,0 +1,229 @@
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Text;
using System.Threading;
namespace Microsoft.AspNet.WebUtilities.Encoders
{
/// <summary>
/// A class which can perform HTML encoding given an allow list of characters which
/// can be represented unencoded.
/// </summary>
/// <remarks>
/// Once constructed, instances of this class are thread-safe for multiple callers.
/// </remarks>
public sealed class HtmlEncoder : IHtmlEncoder
{
    // The default HtmlEncoder (Basic Latin), instantiated on demand
    private static HtmlEncoder _defaultEncoder;

    // The inner encoder, responsible for the actual encoding routines. The allow-list
    // bitmap, the "does anything need encoding?" scan and the surrogate handling all live
    // in UnicodeEncoderBase, so this type no longer keeps its own copy of that logic.
    private readonly HtmlUnicodeEncoder _innerUnicodeEncoder;

    /// <summary>
    /// Instantiates an encoder using the 'Basic Latin' code table as the allow list.
    /// </summary>
    public HtmlEncoder()
        : this(HtmlUnicodeEncoder.BasicLatin)
    {
    }

    /// <summary>
    /// Instantiates an encoder using a custom allow list of characters.
    /// </summary>
    /// <param name="filters">
    /// The filters whose code points are allowed through unencoded. If null, no
    /// character is allowed and every character is encoded.
    /// </param>
    public HtmlEncoder(params ICodePointFilter[] filters)
        : this(new HtmlUnicodeEncoder(filters))
    {
    }

    // All public constructors funnel into this one.
    private HtmlEncoder(HtmlUnicodeEncoder innerEncoder)
    {
        Debug.Assert(innerEncoder != null);
        _innerUnicodeEncoder = innerEncoder;
    }

    /// <summary>
    /// A default instance of the HtmlEncoder, equivalent to allowing only
    /// the 'Basic Latin' character range.
    /// </summary>
    public static HtmlEncoder Default
    {
        get
        {
            // No lock is taken: if two threads get here at the same time each creates an
            // encoder and the last write wins.
            HtmlEncoder defaultEncoder = Volatile.Read(ref _defaultEncoder);
            if (defaultEncoder == null)
            {
                defaultEncoder = new HtmlEncoder();
                Volatile.Write(ref _defaultEncoder, defaultEncoder);
            }
            return defaultEncoder;
        }
    }

    /// <summary>
    /// HTML-encodes the given string.
    /// </summary>
    /// <param name="value">The string to encode.</param>
    /// <returns>
    /// The encoded string. The input is returned as-is when it is null or empty, or
    /// when none of its characters requires encoding.
    /// </returns>
    public string HtmlEncode(string value)
    {
        return _innerUnicodeEncoder.Encode(value);
    }

    private sealed class HtmlUnicodeEncoder : UnicodeEncoderBase
    {
        // A singleton instance of the basic latin encoder.
        private static HtmlUnicodeEncoder _basicLatinSingleton;

        // The worst case encoding is 8 output chars per input char: [input] U+FFFF -> [output] "&#xFFFF;"
        // We don't need to worry about astral code points since they consume *two* input chars to
        // generate at most 10 output chars ("&#x10FFFF;"), which equates to 5 output per input.
        private const int MaxOutputCharsPerInputChar = 8;

        // The base constructor already forbids the HTML-sensitive characters
        // (< > & ' " +), so there is nothing extra to forbid here.
        internal HtmlUnicodeEncoder(ICodePointFilter[] filters)
            : base(filters, MaxOutputCharsPerInputChar)
        {
        }

        internal static HtmlUnicodeEncoder BasicLatin
        {
            get
            {
                HtmlUnicodeEncoder encoder = Volatile.Read(ref _basicLatinSingleton);
                if (encoder == null)
                {
                    encoder = new HtmlUnicodeEncoder(new ICodePointFilter[] { CodePointFilters.BasicLatin });
                    Volatile.Write(ref _basicLatinSingleton, encoder);
                }
                return encoder;
            }
        }

        // Writes a scalar value as an HTML-encoded entity. The four characters below get
        // their named entity; everything else (including the apostrophe, '+' and all
        // supplementary code points) is written numerically.
        protected override void WriteEncodedScalar(StringBuilder builder, uint value)
        {
            if (value == (uint)'<') { builder.Append("&lt;"); }
            else if (value == (uint)'>') { builder.Append("&gt;"); }
            else if (value == (uint)'&') { builder.Append("&amp;"); }
            else if (value == (uint)'\"') { builder.Append("&quot;"); }
            else { WriteEncodedScalarAsNumericEntity(builder, value); }
        }

        // Writes a scalar value as "&#xFFFFFFFF;" without leading zeros.
        private static void WriteEncodedScalarAsNumericEntity(StringBuilder builder, uint value)
        {
            // Locate the most significant non-zero nibble. We stop at shift 0 so that
            // a value of zero still produces a single '0' digit.
            int shift = 28;
            while (shift > 0 && (value >> shift) == 0)
            {
                shift -= 4;
            }

            builder.Append('&');
            builder.Append('#');
            builder.Append('x');
            for (; shift >= 0; shift -= 4)
            {
                builder.Append(HexUtil.IntToChar((value >> shift) & 0xFU));
            }
            builder.Append(';');
        }
    }
}
}

View File

@ -0,0 +1,19 @@
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Collections.Generic;
namespace Microsoft.AspNet.WebUtilities.Encoders
{
/// <summary>
/// Represents a filter which allows only certain Unicode code points through.
/// </summary>
/// <remarks>
/// The encoders in this namespace OR together the code points of every filter they are
/// given to build their allow list. They currently ignore any returned code point that
/// lies outside the range U+0000..U+FFFF.
/// </remarks>
public interface ICodePointFilter
{
/// <summary>
/// Gets an enumeration of all allowed code points.
/// </summary>
/// <returns>The code points which this filter allows.</returns>
IEnumerable<int> GetAllowedCodePoints();
}
}

View File

@ -0,0 +1,25 @@
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
namespace Microsoft.AspNet.WebUtilities.Encoders
{
/// <summary>
/// Provides services for HTML-encoding input.
/// </summary>
public interface IHtmlEncoder
{
/// <summary>
/// HTML-encodes a given input string.
/// </summary>
/// <param name="value">The string to encode. May be null.</param>
/// <returns>
/// The HTML-encoded value, or null if the input string was null.
/// </returns>
/// <remarks>
/// The return value is also safe for inclusion inside an HTML attribute
/// as long as the attribute value is surrounded by single or double quotes.
/// </remarks>
string HtmlEncode(string value);
}
}

View File

@ -0,0 +1,21 @@
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
namespace Microsoft.AspNet.WebUtilities.Encoders
{
/// <summary>
/// Provides services for JavaScript-escaping strings.
/// </summary>
public interface IJavaScriptStringEncoder
{
/// <summary>
/// JavaScript-escapes a given input string.
/// </summary>
/// <param name="value">The string to escape. May be null.</param>
/// <returns>
/// The JavaScript-escaped value, or null if the input string was null.
/// </returns>
string JavaScriptStringEncode(string value);
}
}

View File

@ -0,0 +1,25 @@
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
namespace Microsoft.AspNet.WebUtilities.Encoders
{
/// <summary>
/// Provides services for URL-escaping strings.
/// </summary>
public interface IUrlEncoder
{
/// <summary>
/// URL-escapes a given input string.
/// </summary>
/// <param name="value">The string to escape. May be null.</param>
/// <returns>
/// The URL-escaped value, or null if the input string was null.
/// </returns>
/// <remarks>
/// The return value is safe for use in the segment, query, or
/// fragment portion of a URI.
/// </remarks>
string UrlEncode(string value);
}
}

View File

@ -0,0 +1,163 @@
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Diagnostics;
using System.Text;
using System.Threading;
namespace Microsoft.AspNet.WebUtilities.Encoders
{
/// <summary>
/// Performs JavaScript string escaping. Characters on the configured allow list are
/// emitted unescaped; every other character is escaped.
/// </summary>
/// <remarks>
/// Once constructed, instances of this class are thread-safe for multiple callers.
/// </remarks>
public sealed class JavaScriptStringEncoder : IJavaScriptStringEncoder
{
    // Shared instance handed out by the Default property; created on first use.
    private static JavaScriptStringEncoder _defaultEncoder;

    // Does the real work; this public type is a thin wrapper around it.
    private readonly JavaScriptStringUnicodeEncoder _innerUnicodeEncoder;

    /// <summary>
    /// Creates an encoder whose allow list is the 'Basic Latin' code table.
    /// </summary>
    public JavaScriptStringEncoder()
        : this(JavaScriptStringUnicodeEncoder.BasicLatin)
    {
    }

    /// <summary>
    /// Creates an encoder whose allow list is built from the supplied filters.
    /// </summary>
    /// <param name="filters">The filters whose code points may be emitted unescaped.</param>
    public JavaScriptStringEncoder(params ICodePointFilter[] filters)
        : this(new JavaScriptStringUnicodeEncoder(filters))
    {
    }

    // Both public constructors end up here.
    private JavaScriptStringEncoder(JavaScriptStringUnicodeEncoder innerEncoder)
    {
        Debug.Assert(innerEncoder != null);
        _innerUnicodeEncoder = innerEncoder;
    }

    /// <summary>
    /// Gets a shared JavaScriptStringEncoder which allows only the 'Basic Latin'
    /// character range.
    /// </summary>
    public static JavaScriptStringEncoder Default
    {
        get
        {
            JavaScriptStringEncoder cached = Volatile.Read(ref _defaultEncoder);
            if (cached != null)
            {
                return cached;
            }

            JavaScriptStringEncoder created = new JavaScriptStringEncoder();
            Volatile.Write(ref _defaultEncoder, created);
            return created;
        }
    }

    /// <summary>
    /// JavaScript-escapes the given string by handing it to the inner encoder.
    /// </summary>
    /// <param name="value">The string to escape.</param>
    public string JavaScriptStringEncode(string value)
    {
        return _innerUnicodeEncoder.Encode(value);
    }

    private sealed class JavaScriptStringUnicodeEncoder : UnicodeEncoderBase
    {
        // Shared encoder for the Basic Latin allow list; created on first use.
        private static JavaScriptStringUnicodeEncoder _basicLatinSingleton;

        // Worst case is 6 output chars for one input char: U+FFFF becomes "\uFFFF".
        // Astral code points need no special accounting because each of their two
        // input chars is written as its own escaped surrogate.
        private const int MaxOutputCharsPerInputChar = 6;

        internal JavaScriptStringUnicodeEncoder(ICodePointFilter[] filters)
            : base(filters, MaxOutputCharsPerInputChar)
        {
            // On top of what the base encoder forbids, only the reverse solidus
            // and the solidus need to be escaped.
            ForbidCharacter('\\');
            ForbidCharacter('/');
        }

        internal static JavaScriptStringUnicodeEncoder BasicLatin
        {
            get
            {
                JavaScriptStringUnicodeEncoder cached = Volatile.Read(ref _basicLatinSingleton);
                if (cached != null)
                {
                    return cached;
                }

                JavaScriptStringUnicodeEncoder created = new JavaScriptStringUnicodeEncoder(new[] { CodePointFilters.BasicLatin });
                Volatile.Write(ref _basicLatinSingleton, created);
                return created;
            }
        }

        // Writes a scalar value as a JavaScript-escaped character (or sequence of characters).
        // See ECMA-262, Sec. 7.8.4, and ECMA-404, Sec. 9
        // http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.4
        // http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-404.pdf
        protected override void WriteEncodedScalar(StringBuilder builder, uint value)
        {
            // U+000B is deliberately not given the short form "\v": ECMA-262 permits
            // it but ECMA-404 does not. U+002F SOLIDUS may be written as "\/" under
            // both specifications (in ECMA-262 it is a NonEscape character).
            // HTML-sensitive characters such as the apostrophe and quotation mark get
            // no short form and are written numerically for defense-in-depth.
            // See UnicodeEncoderBase ctor comments for more info.
            switch (value)
            {
                case (uint)'\b':
                    builder.Append(@"\b");
                    break;
                case (uint)'\t':
                    builder.Append(@"\t");
                    break;
                case (uint)'\n':
                    builder.Append(@"\n");
                    break;
                case (uint)'\f':
                    builder.Append(@"\f");
                    break;
                case (uint)'\r':
                    builder.Append(@"\r");
                    break;
                case (uint)'/':
                    builder.Append(@"\/");
                    break;
                case (uint)'\\':
                    builder.Append(@"\\");
                    break;
                default:
                    WriteEncodedScalarAsNumericEntity(builder, value);
                    break;
            }
        }

        // Writes a scalar value as one "\uXXXX" escape, or as two of them
        // (a surrogate pair) when the value is a supplementary code point.
        private static void WriteEncodedScalarAsNumericEntity(StringBuilder builder, uint value)
        {
            if (!UnicodeHelpers.IsSupplementaryCodePoint((int)value))
            {
                // A BMP value fits in a single escape.
                WriteEncodedSingleCharacter(builder, value);
                return;
            }

            // Convert back to UTF-16 and escape both halves of the pair.
            char highSurrogate, lowSurrogate;
            UnicodeHelpers.GetUtf16SurrogatePairFromAstralScalarValue((int)value, out highSurrogate, out lowSurrogate);
            WriteEncodedSingleCharacter(builder, highSurrogate);
            WriteEncodedSingleCharacter(builder, lowSurrogate);
        }

        // Writes a BMP value as the 6-char escape "\uFFFF".
        private static void WriteEncodedSingleCharacter(StringBuilder builder, uint value)
        {
            Debug.Assert(!UnicodeHelpers.IsSupplementaryCodePoint((int)value), "The incoming value should've been in the BMP.");

            builder.Append('\\');
            builder.Append('u');

            // Most significant nibble first; the remaining three are masked off in turn.
            builder.Append(HexUtil.IntToChar(value >> 12));
            for (int shift = 8; shift >= 0; shift -= 4)
            {
                builder.Append(HexUtil.IntToChar((value >> shift) & 0xFU));
            }
        }
    }
}
}

View File

@ -0,0 +1,171 @@
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Text;
namespace Microsoft.AspNet.WebUtilities.Encoders
{
/// <summary>
/// Base class for encoders which emit characters on an allow list unchanged and hand
/// every other scalar value to <see cref="WriteEncodedScalar"/> for escaping.
/// </summary>
internal unsafe abstract class UnicodeEncoderBase
{
    // A bitmap of characters which are allowed to be returned unescaped.
    // One bit per UTF-16 code unit, packed into 32-bit words.
    private readonly uint[] _allowedCharsBitmap = new uint[0x10000 / 32];

    // The worst-case number of output chars generated for any input char.
    // Used to size the output StringBuilder in EncodeCore.
    private readonly int _maxOutputCharsPerInputChar;

    /// <summary>
    /// Instantiates an encoder using a custom allow list of characters.
    /// </summary>
    /// <param name="filters">
    /// The filters whose code points are allowed through unescaped. If null, no
    /// character is allowed.
    /// </param>
    /// <param name="maxOutputCharsPerInputChar">
    /// The largest number of output chars the derived encoder can produce for a
    /// single input char.
    /// </param>
    protected UnicodeEncoderBase(ICodePointFilter[] filters, int maxOutputCharsPerInputChar)
    {
        _maxOutputCharsPerInputChar = maxOutputCharsPerInputChar;

        if (filters != null)
        {
            // Punch a hole for each allowed code point across all filters (this is an OR).
            // We don't allow supplementary (astral) characters for now.
            foreach (var filter in filters)
            {
                foreach (var codePoint in filter.GetAllowedCodePoints())
                {
                    if (!UnicodeHelpers.IsSupplementaryCodePoint(codePoint))
                    {
                        AllowCharacter((char)codePoint);
                    }
                }
            }
        }

        // Forbid characters that are special in HTML.
        // Even though this is a common encoder used by everybody (including URL
        // and JavaScript strings), it's unfortunately common for developers to
        // forget to HTML-encode a string once it has been URL-encoded or
        // JavaScript string-escaped, so this offers extra protection.
        ForbidCharacter('<');
        ForbidCharacter('>');
        ForbidCharacter('&');
        ForbidCharacter('\''); // can be used to escape attributes
        ForbidCharacter('\"'); // can be used to escape attributes
        ForbidCharacter('+'); // technically not HTML-specific, but can be used to perform UTF7-based attacks

        // Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed
        // (includes categories Cc, Cs, Co, Cn, Zl, Zp)
        uint[] definedCharactersBitmap = UnicodeHelpers.GetDefinedCharacterBitmap();
        Debug.Assert(definedCharactersBitmap.Length == _allowedCharsBitmap.Length);
        for (int i = 0; i < _allowedCharsBitmap.Length; i++)
        {
            _allowedCharsBitmap[i] &= definedCharactersBitmap[i];
        }
    }

    // Marks a character as allowed (can be returned unencoded)
    private void AllowCharacter(char c)
    {
        uint codePoint = (uint)c;
        int index = (int)(codePoint >> 5);      // which 32-bit word
        int offset = (int)(codePoint & 0x1FU);  // which bit inside that word
        _allowedCharsBitmap[index] |= 0x1U << offset;
    }

    // Marks a character as forbidden (must be returned encoded)
    protected void ForbidCharacter(char c)
    {
        uint codePoint = (uint)c;
        int index = (int)(codePoint >> 5);
        int offset = (int)(codePoint & 0x1FU);
        _allowedCharsBitmap[index] &= ~(0x1U << offset);
    }

    /// <summary>
    /// Entry point to the encoder.
    /// </summary>
    /// <param name="value">The string to encode.</param>
    /// <returns>
    /// The encoded string. The input is returned as-is when it is null or empty, or
    /// when every one of its characters is on the allow list.
    /// </returns>
    public string Encode(string value)
    {
        if (String.IsNullOrEmpty(value))
        {
            return value;
        }

        // Quick check: does the string need to be encoded at all?
        // If not, just return the input string as-is.
        for (int i = 0; i < value.Length; i++)
        {
            if (!IsCharacterAllowed(value[i]))
            {
                return EncodeCore(value, i);
            }
        }
        return value;
    }

    // Encodes 'input', whose chars before idxOfFirstCharWhichRequiresEncoding are already
    // known to be allowed and are copied through untouched.
    private string EncodeCore(string input, int idxOfFirstCharWhichRequiresEncoding)
    {
        Debug.Assert(idxOfFirstCharWhichRequiresEncoding >= 0);
        Debug.Assert(idxOfFirstCharWhichRequiresEncoding < input.Length);

        // Size the builder from the worst-case expansion that the derived encoder reported
        // to our constructor, rather than from a constant which is only right for one encoder.
        int numCharsWhichMayRequireEncoding = input.Length - idxOfFirstCharWhichRequiresEncoding;
        int sbCapacity = checked(idxOfFirstCharWhichRequiresEncoding + EncoderCommon.GetCapacityOfOutputStringBuilder(numCharsWhichMayRequireEncoding, worstCaseOutputCharsPerInputChar: _maxOutputCharsPerInputChar));
        Debug.Assert(sbCapacity >= input.Length);

        // Allocate the StringBuilder with the first (known to not require encoding) part of the input string,
        // then begin encoding from the last (potentially requiring encoding) part of the input string.
        StringBuilder builder = new StringBuilder(input, 0, idxOfFirstCharWhichRequiresEncoding, sbCapacity);
        fixed (char* pInput = input)
        {
            return EncodeCore2(builder, &pInput[idxOfFirstCharWhichRequiresEncoding], (uint)numCharsWhichMayRequireEncoding);
        }
    }

    // Walks the remaining UTF-16 input one scalar value at a time, appending each to 'builder'
    // either as-is or in its encoded form.
    private string EncodeCore2(StringBuilder builder, char* input, uint charsRemaining)
    {
        while (charsRemaining != 0)
        {
            // endOfString tells the helper that there is no second char to look at.
            int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(input, endOfString: (charsRemaining == 1));
            if (UnicodeHelpers.IsSupplementaryCodePoint(nextScalar))
            {
                // Supplementary characters should always be encoded numerically.
                WriteEncodedScalar(builder, (uint)nextScalar);

                // We consume two UTF-16 characters for a single supplementary character.
                input += 2;
                charsRemaining -= 2;
            }
            else
            {
                // Otherwise, this was a BMP character.
                input++;
                charsRemaining--;
                char c = (char)nextScalar;
                if (IsCharacterAllowed(c))
                {
                    builder.Append(c);
                }
                else
                {
                    WriteEncodedScalar(builder, (uint)nextScalar);
                }
            }
        }
        return builder.ToString();
    }

    // Determines whether the given character can be returned unencoded.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private bool IsCharacterAllowed(char c)
    {
        uint codePoint = (uint)c;
        int index = (int)(codePoint >> 5);
        int offset = (int)(codePoint & 0x1FU);
        return ((_allowedCharsBitmap[index] >> offset) & 0x1U) != 0;
    }

    // Appends the encoded form of 'value' to 'builder'. Called for every supplementary
    // scalar value and for every BMP character which is not on the allow list.
    protected abstract void WriteEncodedScalar(StringBuilder builder, uint value);
}
}

View File

@ -0,0 +1,228 @@
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Diagnostics;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Threading;
namespace Microsoft.AspNet.WebUtilities.Encoders
{
/// <summary>
/// Contains helpers for dealing with Unicode code points.
/// </summary>
internal unsafe static class UnicodeHelpers
{
    /// <summary>
    /// Used for invalid Unicode sequences or other unrepresentable values.
    /// </summary>
    private const char UNICODE_REPLACEMENT_CHAR = '\uFFFD';

    /// <summary>
    /// The last code point defined by the Unicode specification.
    /// </summary>
    internal const int UNICODE_LAST_CODEPOINT = 0x10FFFF;

    // Cache for the defined-character bitmap; filled in by CreateDefinedCharacterBitmap.
    private static uint[] _definedCharacterBitmap;

    /// <summary>
    /// Helper method which creates a bitmap of all characters which are
    /// defined per version 7.0.0 of the Unicode specification.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The embedded resource which holds the bitmap could not be found.
    /// </exception>
    [MethodImpl(MethodImplOptions.NoInlining)]
    private static uint[] CreateDefinedCharacterBitmap()
    {
        const string resourceName = "compiler/resources/unicode-7.0.0-defined-characters.bin";

        // Read everything in as raw bytes. The stream should be exactly 8KB in size.
        byte[] rawData = new byte[8 * 1024];
        using (var stream = typeof(UnicodeHelpers).GetTypeInfo().Assembly.GetManifestResourceStream(resourceName))
        {
            // GetManifestResourceStream returns null rather than throwing when the resource
            // is missing; report that clearly instead of failing with a NullReferenceException.
            if (stream == null)
            {
                throw new InvalidOperationException("The embedded resource '" + resourceName + "' could not be found.");
            }

            if (stream.Length != rawData.Length)
            {
                Environment.FailFast("Corrupt data detected.");
            }

            // Stream.Read may return fewer bytes than requested, so loop until the buffer is full.
            for (int numBytesReadTotal = 0; numBytesReadTotal < rawData.Length;)
            {
                int numBytesReadThisIteration = stream.Read(rawData, numBytesReadTotal, rawData.Length - numBytesReadTotal);
                if (numBytesReadThisIteration == 0)
                {
                    Environment.FailFast("Corrupt data detected.");
                }
                numBytesReadTotal += numBytesReadThisIteration;
            }
        }

        // Finally, convert the byte[] to a uint[].
        // The incoming bytes are little-endian.
        uint[] retVal = new uint[2 * 1024];
        for (int i = 0; i < retVal.Length; i++)
        {
            retVal[i] = (((uint)rawData[4 * i + 3]) << 24)
                | (((uint)rawData[4 * i + 2]) << 16)
                | (((uint)rawData[4 * i + 1]) << 8)
                | (uint)rawData[4 * i];
        }

        // And we're done!
        Volatile.Write(ref _definedCharacterBitmap, retVal);
        return retVal;
    }

    /// <summary>
    /// Returns a bitmap of all characters which are defined per version 7.0.0
    /// of the Unicode specification.
    /// </summary>
    /// <remarks>
    /// The bitmap is created on first use and cached. The cached array itself is
    /// returned, not a copy.
    /// </remarks>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static uint[] GetDefinedCharacterBitmap()
    {
        return Volatile.Read(ref _definedCharacterBitmap) ?? CreateDefinedCharacterBitmap();
    }

    /// <summary>
    /// Given a UTF-16 character stream, reads the next scalar value from the stream.
    /// Set 'endOfString' to true if 'pChar' points to the last character in the stream.
    /// </summary>
    /// <returns>
    /// The scalar value, or U+FFFD if 'pChar' points to an unmatched surrogate.
    /// </returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static int GetScalarValueFromUtf16(char* pChar, bool endOfString)
    {
        // This method is marked as AggressiveInlining to handle the common case of a non-surrogate
        // character. The surrogate case is handled in the slower fallback code path.
        char thisChar = *pChar;
        return (Char.IsSurrogate(thisChar)) ? GetScalarValueFromUtf16Slow(pChar, endOfString) : thisChar;
    }

    // Fallback for GetScalarValueFromUtf16 when the first char is a surrogate.
    // pChar[1] is only read when 'endOfString' is false.
    private static int GetScalarValueFromUtf16Slow(char* pChar, bool endOfString)
    {
        char firstChar = pChar[0];

        if (!Char.IsSurrogate(firstChar))
        {
            Debug.Fail("This case should've been handled by the fast path.");
            return firstChar;
        }
        else if (Char.IsHighSurrogate(firstChar))
        {
            if (endOfString)
            {
                // unmatched surrogate - substitute
                return UNICODE_REPLACEMENT_CHAR;
            }
            else
            {
                char secondChar = pChar[1];
                if (Char.IsLowSurrogate(secondChar))
                {
                    // valid surrogate pair - extract codepoint
                    return GetScalarValueFromUtf16SurrogatePair(firstChar, secondChar);
                }
                else
                {
                    // unmatched surrogate - substitute
                    return UNICODE_REPLACEMENT_CHAR;
                }
            }
        }
        else
        {
            // unmatched surrogate - substitute
            Debug.Assert(Char.IsLowSurrogate(firstChar));
            return UNICODE_REPLACEMENT_CHAR;
        }
    }

    // Combines a high / low surrogate pair into the supplementary scalar value it represents.
    private static int GetScalarValueFromUtf16SurrogatePair(char highSurrogate, char lowSurrogate)
    {
        Debug.Assert(Char.IsHighSurrogate(highSurrogate));
        Debug.Assert(Char.IsLowSurrogate(lowSurrogate));

        // See http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf, Table 3.5 for the
        // details of this conversion. We don't use Char.ConvertToUtf32 because its exception
        // handling shows up on the hot path, and our caller has already sanitized the inputs.
        // Adding (1 << 6) before the shift by 10 is what contributes the 0x10000 offset.
        return (lowSurrogate & 0x3ff) | (((highSurrogate & 0x3ff) + (1 << 6)) << 10);
    }

    /// <summary>
    /// Splits a supplementary scalar value (U+10000..U+10FFFF) into its UTF-16 surrogate pair.
    /// </summary>
    internal static void GetUtf16SurrogatePairFromAstralScalarValue(int scalar, out char highSurrogate, out char lowSurrogate)
    {
        Debug.Assert(0x10000 <= scalar && scalar <= UNICODE_LAST_CODEPOINT);

        // See http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf, Table 3.5 for the
        // details of this conversion. We don't use Char.ConvertFromUtf32 because its exception
        // handling shows up on the hot path, it allocates temporary strings (which we don't want),
        // and our caller has already sanitized the inputs.
        int x = scalar & 0xFFFF;
        int u = scalar >> 16;
        int w = u - 1;
        highSurrogate = (char)(0xD800 | (w << 6) | (x >> 10));
        lowSurrogate = (char)(0xDC00 | (x & 0x3FF));
    }

    /// <summary>
    /// Given a Unicode scalar value, returns the UTF-8 representation of the value.
    /// The return value's bytes should be popped from the LSB.
    /// </summary>
    /// <remarks>
    /// The first byte of the sequence is stored in the lowest 8 bits, the second byte in
    /// the next 8 bits, and so on. For a four-byte sequence the top bit of the result is
    /// set, so the returned int is negative.
    /// NOTE(review): callers are not visible here - confirm that they pop bytes with an
    /// unsigned shift (or a fixed count) so that sign extension cannot cause trouble.
    /// </remarks>
    internal static int GetUtf8RepresentationForScalarValue(uint scalar)
    {
        Debug.Assert(scalar <= UNICODE_LAST_CODEPOINT);

        // See http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf, Table 3.6 for the
        // details of this conversion. We don't use UTF8Encoding since we're encoding
        // a scalar code point, not a UTF16 character sequence.
        if (scalar <= 0x7f)
        {
            // one byte used: scalar 00000000 0xxxxxxx -> byte sequence 0xxxxxxx
            byte firstByte = (byte)scalar;
            return firstByte;
        }
        else if (scalar <= 0x7ff)
        {
            // two bytes used: scalar 00000yyy yyxxxxxx -> byte sequence 110yyyyy 10xxxxxx
            byte firstByte = (byte)(0xc0 | (scalar >> 6));
            byte secondByteByte = (byte)(0x80 | (scalar & 0x3f));
            return ((secondByteByte << 8) | firstByte);
        }
        else if (scalar <= 0xffff)
        {
            // three bytes used: scalar zzzzyyyy yyxxxxxx -> byte sequence 1110zzzz 10yyyyyy 10xxxxxx
            byte firstByte = (byte)(0xe0 | (scalar >> 12));
            byte secondByte = (byte)(0x80 | ((scalar >> 6) & 0x3f));
            byte thirdByte = (byte)(0x80 | (scalar & 0x3f));
            return ((((thirdByte << 8) | secondByte) << 8) | firstByte);
        }
        else
        {
            // four bytes used: scalar 000uuuuu zzzzyyyy yyxxxxxx -> byte sequence 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
            byte firstByte = (byte)(0xf0 | (scalar >> 18));
            byte secondByte = (byte)(0x80 | ((scalar >> 12) & 0x3f));
            byte thirdByte = (byte)(0x80 | ((scalar >> 6) & 0x3f));
            byte fourthByte = (byte)(0x80 | (scalar & 0x3f));
            return ((((((fourthByte << 8) | thirdByte) << 8) | secondByte) << 8) | firstByte);
        }
    }

    /// <summary>
    /// Returns a value stating whether a character is defined per version 7.0.0
    /// of the Unicode specification. Certain classes of characters (control chars,
    /// private use, surrogates, some whitespace) are considered "undefined" for
    /// our purposes.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static bool IsCharacterDefined(char c)
    {
        uint codePoint = (uint)c;
        int index = (int)(codePoint >> 5);      // which 32-bit word
        int offset = (int)(codePoint & 0x1FU);  // which bit inside that word
        return ((GetDefinedCharacterBitmap()[index] >> offset) & 0x1U) != 0;
    }

    /// <summary>
    /// Determines whether the given scalar value is in the supplementary plane and thus
    /// requires 2 characters to be represented in UTF-16 (as a surrogate pair).
    /// </summary>
    /// <remarks>
    /// The check is "any bit above the low 16 is set", so negative values also return true.
    /// </remarks>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static bool IsSupplementaryCodePoint(int scalar)
    {
        return ((scalar & ~((int)Char.MaxValue)) != 0);
    }
}
}

View File

@ -0,0 +1,161 @@
// Copyright (c) Microsoft Open Technologies, Inc. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Diagnostics;
using System.Text;
using System.Threading;
namespace Microsoft.AspNet.WebUtilities.Encoders
{
/// <summary>
/// Performs URL string escaping. Characters are emitted unescaped only when they
/// appear in the allow list the encoder was constructed with; everything else is
/// percent-encoded.
/// </summary>
/// <remarks>
/// Once constructed, instances of this class are thread-safe for multiple callers.
/// </remarks>
public sealed class UrlEncoder : IUrlEncoder
{
    // Backing field for the Default property (Basic Latin allow list); created on first use
    private static UrlEncoder _defaultEncoder;

    // The encoder that does the real work; every public encoding call is forwarded to it
    private readonly UrlUnicodeEncoder _innerUnicodeEncoder;

    /// <summary>
    /// Creates an encoder whose allow list is the 'Basic Latin' code table.
    /// </summary>
    public UrlEncoder()
        : this(UrlUnicodeEncoder.BasicLatin)
    {
    }

    /// <summary>
    /// Creates an encoder whose allow list is built from the supplied filters.
    /// </summary>
    /// <param name="filters">The code point filters describing which characters may be allowed.</param>
    public UrlEncoder(params ICodePointFilter[] filters)
        : this(new UrlUnicodeEncoder(filters))
    {
    }

    // Common construction path: wraps an already-built inner encoder.
    private UrlEncoder(UrlUnicodeEncoder innerEncoder)
    {
        Debug.Assert(innerEncoder != null);
        _innerUnicodeEncoder = innerEncoder;
    }

    /// <summary>
    /// A shared instance of the UrlEncoder, equivalent to one that allows only
    /// the 'Basic Latin' character range.
    /// </summary>
    public static UrlEncoder Default
    {
        get
        {
            UrlEncoder existing = Volatile.Read(ref _defaultEncoder);
            if (existing != null)
            {
                return existing;
            }

            // Nothing published yet: build an instance and publish it. No lock is taken,
            // so two racing callers may each build (and return) their own instance.
            UrlEncoder created = new UrlEncoder();
            Volatile.Write(ref _defaultEncoder, created);
            return created;
        }
    }

    /// <summary>
    /// URL-encodes the supplied string.
    /// </summary>
    /// <param name="value">The string to encode.</param>
    /// <returns>The result produced by the inner Unicode encoder for <paramref name="value"/>.</returns>
    public string UrlEncode(string value)
    {
        string encoded = _innerUnicodeEncoder.Encode(value);
        return encoded;
    }

    // The concrete encoder: configures which characters are forbidden in URLs and
    // knows how to write a forbidden scalar as a run of "%XX" escapes.
    private sealed class UrlUnicodeEncoder : UnicodeEncoderBase
    {
        // Lazily-created shared encoder for the Basic Latin allow list.
        private static UrlUnicodeEncoder _basicLatinSingleton;

        // Input is converted to UTF8 before escaping, so the worst case is
        // 9 output chars per input char: [input] U+FFFF -> [output] "%XX%YY%ZZ".
        // Astral code points are not a concern: they consume 2 input chars to
        // produce 12 output chars "%XX%YY%ZZ%WW", i.e. only 6 output chars per input char.
        private const int MaxOutputCharsPerInputChar = 9;

        internal UrlUnicodeEncoder(ICodePointFilter[] filters)
            : base(filters, MaxOutputCharsPerInputChar)
        {
            // Per RFC 3987, Sec. 2.2, we want encodings that are safe for
            // 'isegment', 'iquery', and 'ifragment'. The only thing these
            // all have in common is 'ipchar', which is defined as such:
            //
            //    ipchar         = iunreserved / pct-encoded / sub-delims / ":"
            //                   / "@"
            //
            //    iunreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
            //
            //    ucschar        = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
            //                   / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
            //                   / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
            //                   / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
            //                   / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
            //                   / %xD0000-DFFFD / %xE1000-EFFFD
            //
            //    pct-encoded    = "%" HEXDIG HEXDIG
            //
            //    sub-delims     = "!" / "$" / "&" / "'" / "(" / ")"
            //                   / "*" / "+" / "," / ";" / "="
            //
            // From this list, the base encoder blocks "&", "'", "+",
            // and we'll additionally block "=" since it has special meaning
            // in x-www-form-urlencoded representations.
            //
            // This means that the full list of allowed characters from the
            // Basic Latin set is:
            // ALPHA / DIGIT / "-" / "." / "_" / "~" / "!" / "$" / "(" / ")" / "*" / "," / ";" / ":" / "@"

            // Basic Latin chars that the base encoder does not already disallow
            const string forbiddenChars = @" #%/=?[\]^`{|}";
            for (int index = 0; index < forbiddenChars.Length; index++)
            {
                ForbidCharacter(forbiddenChars[index]);
            }

            // Specials (U+FFF0 .. U+FFFF) fall outside the 'ucschar' ranges listed above,
            // so each of those 16 code units is forbidden as well.
            for (int codeUnit = 0xFFF0; codeUnit <= 0xFFFF; codeUnit++)
            {
                ForbidCharacter((char)codeUnit);
            }

            // Supplementary characters are forbidden anyway by the base encoder
        }

        // The shared Basic Latin encoder, built on first access.
        internal static UrlUnicodeEncoder BasicLatin
        {
            get
            {
                UrlUnicodeEncoder existing = Volatile.Read(ref _basicLatinSingleton);
                if (existing != null)
                {
                    return existing;
                }

                // Same lock-free publication as UrlEncoder.Default: racing callers
                // may each construct their own instance.
                UrlUnicodeEncoder created = new UrlUnicodeEncoder(new[] { CodePointFilters.BasicLatin });
                Volatile.Write(ref _basicLatinSingleton, created);
                return created;
            }
        }

        // Writes a scalar value as a percent-encoded sequence of UTF8 bytes, per RFC 3987.
        protected override void WriteEncodedScalar(StringBuilder builder, uint value)
        {
            // The helper returns the UTF8 bytes packed into a single integer with the first
            // byte in the least-significant position, so consuming the low byte and shifting
            // right emits the bytes in sequence order. The loop body runs at least once so
            // that a packed value of zero is still written as one escape.
            uint pendingBytes = (uint)UnicodeHelpers.GetUtf8RepresentationForScalarValue(value);
            do
            {
                char upperHexDigit, lowerHexDigit;
                HexUtil.WriteHexEncodedByte((byte)pendingBytes, out upperHexDigit, out lowerHexDigit);
                builder.Append('%').Append(upperHexDigit).Append(lowerHexDigit);
                pendingBytes >>= 8;
            } while (pendingBytes != 0);
        }
    }
}
}

View File

@ -1,6 +1,9 @@
{
"version": "1.0.0-*",
"description": "ASP.NET 5 common helper methods such as URL encoding.",
"compilationOptions": {
"allowUnsafe": true
},
"dependencies": {
},
"frameworks": {
@ -11,8 +14,11 @@
"System.Diagnostics.Debug": "4.0.10-beta-*",
"System.IO": "4.0.10-beta-*",
"System.IO.FileSystem": "4.0.0-beta-*",
"System.Linq": "4.0.0-beta-*",
"System.Reflection.TypeExtensions": "4.0.0-beta-*",
"System.Runtime": "4.0.20-beta-*",
"System.Runtime.Extensions": "4.0.10-beta-*"
"System.Runtime.Extensions": "4.0.10-beta-*",
"System.Threading": "4.0.10-beta-*"
}
}
}