Update HtmlEncoder to wrap UnicodeEncoderBase

This commit is contained in:
Levi B 2015-02-11 12:03:47 -08:00
parent fc7ed3a9cd
commit 14c872d981
1 changed files with 56 additions and 145 deletions

View File

@ -3,7 +3,6 @@
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Text;
using System.Threading;
@ -21,14 +20,14 @@ namespace Microsoft.AspNet.WebUtilities.Encoders
// The default HtmlEncoder (Basic Latin), instantiated on demand
private static HtmlEncoder _defaultEncoder;
// A bitmap of characters which are allowed to be returned unescaped.
private readonly uint[] _allowedCharsBitmap = new uint[0x10000 / 32];
// The inner encoder, responsible for the actual encoding routines
private readonly HtmlUnicodeEncoder _innerUnicodeEncoder;
/// <summary>
/// Instantiates an encoder using the 'Basic Latin' code table as the allow list.
/// </summary>
public HtmlEncoder()
: this(CodePointFilters.BasicLatin)
: this(HtmlUnicodeEncoder.BasicLatin)
{
}
@ -36,41 +35,14 @@ namespace Microsoft.AspNet.WebUtilities.Encoders
/// Instantiates an encoder using a custom allow list of characters.
/// </summary>
public HtmlEncoder(params ICodePointFilter[] filters)
: this(new HtmlUnicodeEncoder(filters))
{
if (filters == null)
{
return; // no characters are allowed, just no-op immediately
}
}
// Punch a hole for each allowed code point across all filters (this is an OR).
// We don't allow supplementary (astral) characters for now.
foreach (var filter in filters)
{
foreach (var codePoint in filter.GetAllowedCodePoints())
{
if (!UnicodeHelpers.IsSupplementaryCodePoint(codePoint))
{
AllowCharacter((char)codePoint);
}
}
}
// Forbid characters that are special in HTML
ForbidCharacter('<');
ForbidCharacter('>');
ForbidCharacter('&');
ForbidCharacter('\''); // can be used to escape attributes
ForbidCharacter('\"'); // can be used to escape attributes
ForbidCharacter('+'); // technically not HTML-specific, but can be used to perform UTF7-based attacks
// Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed
// (includes categories Cc, Cs, Co, Cn, Zl, Zp)
uint[] definedCharactersBitmap = UnicodeHelpers.GetDefinedCharacterBitmap();
Debug.Assert(definedCharactersBitmap.Length == _allowedCharsBitmap.Length);
for (int i = 0; i < _allowedCharsBitmap.Length; i++)
{
_allowedCharsBitmap[i] &= definedCharactersBitmap[i];
}
private HtmlEncoder(HtmlUnicodeEncoder innerEncoder)
{
Debug.Assert(innerEncoder != null);
_innerUnicodeEncoder = innerEncoder;
}
/// <summary>
@ -91,139 +63,78 @@ namespace Microsoft.AspNet.WebUtilities.Encoders
}
}
// Marks a character as allowed (can be returned unencoded)
private void AllowCharacter(char c)
{
uint codePoint = (uint)c;
int index = (int)(codePoint >> 5);
int offset = (int)(codePoint & 0x1FU);
_allowedCharsBitmap[index] |= 0x1U << offset;
}
// Marks a character as forbidden (must be returned encoded)
private void ForbidCharacter(char c)
{
uint codePoint = (uint)c;
int index = (int)(codePoint >> 5);
int offset = (int)(codePoint & 0x1FU);
_allowedCharsBitmap[index] &= ~(0x1U << offset);
}
/// <summary>
/// Everybody's favorite HtmlEncode routine.
/// </summary>
public string HtmlEncode(string value)
{
if (String.IsNullOrEmpty(value))
{
return value;
}
// Quick check: does the string need to be encoded at all?
// If not, just return the input string as-is.
for (int i = 0; i < value.Length; i++)
{
if (!IsCharacterAllowed(value[i]))
{
return HtmlEncodeImpl(value, i);
}
}
return value;
return _innerUnicodeEncoder.Encode(value);
}
private string HtmlEncodeImpl(string input, int idxOfFirstCharWhichRequiresEncoding)
private sealed class HtmlUnicodeEncoder : UnicodeEncoderBase
{
Debug.Assert(idxOfFirstCharWhichRequiresEncoding >= 0);
Debug.Assert(idxOfFirstCharWhichRequiresEncoding < input.Length);
// A singleton instance of the basic latin encoder.
private static HtmlUnicodeEncoder _basicLatinSingleton;
// The worst case encoding is 8 output chars per input char: [input] U+FFFF -> [output] "&#xFFFF;"
// We don't need to worry about astral code points since they consume *two* input chars to
// generate at most 10 output chars ("&#x10FFFF;"), which equates to 5 output per input.
int numCharsWhichMayRequireEncoding = input.Length - idxOfFirstCharWhichRequiresEncoding;
int sbCapacity = checked(idxOfFirstCharWhichRequiresEncoding + EncoderCommon.GetCapacityOfOutputStringBuilder(numCharsWhichMayRequireEncoding, worstCaseOutputCharsPerInputChar: 8));
Debug.Assert(sbCapacity >= input.Length);
// generate at most 10 output chars ("&#x10FFFF;"), which equates to 5 output chars per input char.
private const int MaxOutputCharsPerInputChar = 8;
// Allocate the StringBuilder with the first (known to not require encoding) part of the input string,
// then begin encoding from the last (potentially requiring encoding) part of the input string.
StringBuilder builder = new StringBuilder(input, 0, idxOfFirstCharWhichRequiresEncoding, sbCapacity);
fixed (char* pInput = input)
internal HtmlUnicodeEncoder(ICodePointFilter[] filters)
: base(filters, MaxOutputCharsPerInputChar)
{
return HtmlEncodeImpl2(builder, &pInput[idxOfFirstCharWhichRequiresEncoding], (uint)numCharsWhichMayRequireEncoding);
}
}
private string HtmlEncodeImpl2(StringBuilder builder, char* input, uint charsRemaining)
{
while (charsRemaining != 0)
internal static HtmlUnicodeEncoder BasicLatin
{
int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(input, endOfString: (charsRemaining == 1));
if (UnicodeHelpers.IsSupplementaryCodePoint(nextScalar))
get
{
// Supplementary characters should always be encoded numerically.
WriteScalarAsHtmlEncodedEntity(builder, (uint)nextScalar);
// We consume two UTF-16 characters for a single supplementary character.
input += 2;
charsRemaining -= 2;
}
else
{
// Otherwise, this was a BMP character.
input++;
charsRemaining--;
char c = (char)nextScalar;
if (IsCharacterAllowed(c))
HtmlUnicodeEncoder encoder = Volatile.Read(ref _basicLatinSingleton);
if (encoder == null)
{
builder.Append(c);
}
else
{
if (c == '<') { builder.Append("&lt;"); }
else if (c == '>') { builder.Append("&gt;"); }
else if (c == '&') { builder.Append("&amp;"); }
else if (c == '\"') { builder.Append("&quot;"); }
else { WriteScalarAsHtmlEncodedEntity(builder, (uint)nextScalar); }
encoder = new HtmlUnicodeEncoder(new[] { CodePointFilters.BasicLatin });
Volatile.Write(ref _basicLatinSingleton, encoder);
}
return encoder;
}
}
return builder.ToString();
}
// Determines whether the given character can be returned unencoded.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool IsCharacterAllowed(char c)
{
uint codePoint = (uint)c;
int index = (int)(codePoint >> 5);
int offset = (int)(codePoint & 0x1FU);
return ((_allowedCharsBitmap[index] >> offset) & 0x1U) != 0;
}
// Writes a scalar value as "&#xFFFFFFFF;"
private static void WriteScalarAsHtmlEncodedEntity(StringBuilder builder, uint value)
{
// We're building the characters up in reverse
char* chars = stackalloc char[8 /* "FFFFFFFF" */];
int numCharsWritten = 0;
do
// Writes a scalar value as an HTML-encoded entity.
protected override void WriteEncodedScalar(StringBuilder builder, uint value)
{
Debug.Assert(numCharsWritten < 8, "Couldn't have written 8 characters out by this point.");
// Pop off the last nibble
chars[numCharsWritten++] = HexUtil.IntToChar(value & 0xFU);
value >>= 4;
} while (value != 0);
if (value == (uint)'\"') { builder.Append("&quot;"); }
else if (value == (uint)'&') { builder.Append("&amp;"); }
else if (value == (uint)'<') { builder.Append("&lt;"); }
else if (value == (uint)'>') { builder.Append("&gt;"); }
else { WriteEncodedScalarAsNumericEntity(builder, value); }
}
// Finally, write out the HTML-encoded scalar value.
builder.Append('&');
builder.Append('#');
builder.Append('x');
Debug.Assert(numCharsWritten > 0, "At least one character should've been written.");
do
// Writes a scalar value as an HTML-encoded numeric entity.
private static void WriteEncodedScalarAsNumericEntity(StringBuilder builder, uint value)
{
builder.Append(chars[--numCharsWritten]);
} while (numCharsWritten != 0);
builder.Append(';');
// We're building the characters up in reverse
char* chars = stackalloc char[8 /* "FFFFFFFF" */];
int numCharsWritten = 0;
do
{
Debug.Assert(numCharsWritten < 8, "Couldn't have written 8 characters out by this point.");
// Pop off the last nibble
chars[numCharsWritten++] = HexUtil.IntToChar(value & 0xFU);
value >>= 4;
} while (value != 0);
// Finally, write out the HTML-encoded scalar value.
builder.Append('&');
builder.Append('#');
builder.Append('x');
Debug.Assert(numCharsWritten > 0, "At least one character should've been written.");
do
{
builder.Append(chars[--numCharsWritten]);
} while (numCharsWritten != 0);
builder.Append(';');
}
}
}
}