Update HtmlEncoder to wrap UnicodeEncoderBase
This commit is contained in:
parent
fc7ed3a9cd
commit
14c872d981
|
|
@ -3,7 +3,6 @@
|
|||
|
||||
using System;
|
||||
using System.Diagnostics;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Text;
|
||||
using System.Threading;
|
||||
|
||||
|
|
@ -21,14 +20,14 @@ namespace Microsoft.AspNet.WebUtilities.Encoders
|
|||
// The default HtmlEncoder (Basic Latin), instantiated on demand
|
||||
private static HtmlEncoder _defaultEncoder;
|
||||
|
||||
// A bitmap of characters which are allowed to be returned unescaped.
|
||||
private readonly uint[] _allowedCharsBitmap = new uint[0x10000 / 32];
|
||||
// The inner encoder, responsible for the actual encoding routines
|
||||
private readonly HtmlUnicodeEncoder _innerUnicodeEncoder;
|
||||
|
||||
/// <summary>
|
||||
/// Instantiates an encoder using the 'Basic Latin' code table as the allow list.
|
||||
/// </summary>
|
||||
public HtmlEncoder()
|
||||
: this(CodePointFilters.BasicLatin)
|
||||
: this(HtmlUnicodeEncoder.BasicLatin)
|
||||
{
|
||||
}
|
||||
|
||||
|
|
@ -36,41 +35,14 @@ namespace Microsoft.AspNet.WebUtilities.Encoders
|
|||
/// Instantiates an encoder using a custom allow list of characters.
|
||||
/// </summary>
|
||||
public HtmlEncoder(params ICodePointFilter[] filters)
|
||||
: this(new HtmlUnicodeEncoder(filters))
|
||||
{
|
||||
if (filters == null)
|
||||
{
|
||||
return; // no characters are allowed, just no-op immediately
|
||||
}
|
||||
}
|
||||
|
||||
// Punch a hole for each allowed code point across all filters (this is an OR).
|
||||
// We don't allow supplementary (astral) characters for now.
|
||||
foreach (var filter in filters)
|
||||
{
|
||||
foreach (var codePoint in filter.GetAllowedCodePoints())
|
||||
{
|
||||
if (!UnicodeHelpers.IsSupplementaryCodePoint(codePoint))
|
||||
{
|
||||
AllowCharacter((char)codePoint);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Forbid characters that are special in HTML
|
||||
ForbidCharacter('<');
|
||||
ForbidCharacter('>');
|
||||
ForbidCharacter('&');
|
||||
ForbidCharacter('\''); // can be used to escape attributes
|
||||
ForbidCharacter('\"'); // can be used to escape attributes
|
||||
ForbidCharacter('+'); // technically not HTML-specific, but can be used to perform UTF7-based attacks
|
||||
|
||||
// Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed
|
||||
// (includes categories Cc, Cs, Co, Cn, Zl, Zp)
|
||||
uint[] definedCharactersBitmap = UnicodeHelpers.GetDefinedCharacterBitmap();
|
||||
Debug.Assert(definedCharactersBitmap.Length == _allowedCharsBitmap.Length);
|
||||
for (int i = 0; i < _allowedCharsBitmap.Length; i++)
|
||||
{
|
||||
_allowedCharsBitmap[i] &= definedCharactersBitmap[i];
|
||||
}
|
||||
private HtmlEncoder(HtmlUnicodeEncoder innerEncoder)
|
||||
{
|
||||
Debug.Assert(innerEncoder != null);
|
||||
_innerUnicodeEncoder = innerEncoder;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
|
@ -91,139 +63,78 @@ namespace Microsoft.AspNet.WebUtilities.Encoders
|
|||
}
|
||||
}
|
||||
|
||||
// Marks a character as allowed (can be returned unencoded)
|
||||
private void AllowCharacter(char c)
|
||||
{
|
||||
uint codePoint = (uint)c;
|
||||
int index = (int)(codePoint >> 5);
|
||||
int offset = (int)(codePoint & 0x1FU);
|
||||
_allowedCharsBitmap[index] |= 0x1U << offset;
|
||||
}
|
||||
|
||||
// Marks a character as forbidden (must be returned encoded)
|
||||
private void ForbidCharacter(char c)
|
||||
{
|
||||
uint codePoint = (uint)c;
|
||||
int index = (int)(codePoint >> 5);
|
||||
int offset = (int)(codePoint & 0x1FU);
|
||||
_allowedCharsBitmap[index] &= ~(0x1U << offset);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Everybody's favorite HtmlEncode routine.
|
||||
/// </summary>
|
||||
public string HtmlEncode(string value)
|
||||
{
|
||||
if (String.IsNullOrEmpty(value))
|
||||
{
|
||||
return value;
|
||||
}
|
||||
|
||||
// Quick check: does the string need to be encoded at all?
|
||||
// If not, just return the input string as-is.
|
||||
for (int i = 0; i < value.Length; i++)
|
||||
{
|
||||
if (!IsCharacterAllowed(value[i]))
|
||||
{
|
||||
return HtmlEncodeImpl(value, i);
|
||||
}
|
||||
}
|
||||
return value;
|
||||
return _innerUnicodeEncoder.Encode(value);
|
||||
}
|
||||
|
||||
private string HtmlEncodeImpl(string input, int idxOfFirstCharWhichRequiresEncoding)
|
||||
private sealed class HtmlUnicodeEncoder : UnicodeEncoderBase
|
||||
{
|
||||
Debug.Assert(idxOfFirstCharWhichRequiresEncoding >= 0);
|
||||
Debug.Assert(idxOfFirstCharWhichRequiresEncoding < input.Length);
|
||||
// A singleton instance of the basic latin encoder.
|
||||
private static HtmlUnicodeEncoder _basicLatinSingleton;
|
||||
|
||||
// The worst case encoding is 8 output chars per input char: [input] U+FFFF -> [output] ""
|
||||
// We don't need to worry about astral code points since they consume *two* input chars to
|
||||
// generate at most 10 output chars (""), which equates to 5 output per input.
|
||||
int numCharsWhichMayRequireEncoding = input.Length - idxOfFirstCharWhichRequiresEncoding;
|
||||
int sbCapacity = checked(idxOfFirstCharWhichRequiresEncoding + EncoderCommon.GetCapacityOfOutputStringBuilder(numCharsWhichMayRequireEncoding, worstCaseOutputCharsPerInputChar: 8));
|
||||
Debug.Assert(sbCapacity >= input.Length);
|
||||
// generate at most 10 output chars (""), which equates to 5 output chars per input char.
|
||||
private const int MaxOutputCharsPerInputChar = 8;
|
||||
|
||||
// Allocate the StringBuilder with the first (known to not require encoding) part of the input string,
|
||||
// then begin encoding from the last (potentially requiring encoding) part of the input string.
|
||||
StringBuilder builder = new StringBuilder(input, 0, idxOfFirstCharWhichRequiresEncoding, sbCapacity);
|
||||
fixed (char* pInput = input)
|
||||
internal HtmlUnicodeEncoder(ICodePointFilter[] filters)
|
||||
: base(filters, MaxOutputCharsPerInputChar)
|
||||
{
|
||||
return HtmlEncodeImpl2(builder, &pInput[idxOfFirstCharWhichRequiresEncoding], (uint)numCharsWhichMayRequireEncoding);
|
||||
}
|
||||
}
|
||||
|
||||
private string HtmlEncodeImpl2(StringBuilder builder, char* input, uint charsRemaining)
|
||||
{
|
||||
while (charsRemaining != 0)
|
||||
internal static HtmlUnicodeEncoder BasicLatin
|
||||
{
|
||||
int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(input, endOfString: (charsRemaining == 1));
|
||||
if (UnicodeHelpers.IsSupplementaryCodePoint(nextScalar))
|
||||
get
|
||||
{
|
||||
// Supplementary characters should always be encoded numerically.
|
||||
WriteScalarAsHtmlEncodedEntity(builder, (uint)nextScalar);
|
||||
|
||||
// We consume two UTF-16 characters for a single supplementary character.
|
||||
input += 2;
|
||||
charsRemaining -= 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Otherwise, this was a BMP character.
|
||||
input++;
|
||||
charsRemaining--;
|
||||
char c = (char)nextScalar;
|
||||
if (IsCharacterAllowed(c))
|
||||
HtmlUnicodeEncoder encoder = Volatile.Read(ref _basicLatinSingleton);
|
||||
if (encoder == null)
|
||||
{
|
||||
builder.Append(c);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (c == '<') { builder.Append("<"); }
|
||||
else if (c == '>') { builder.Append(">"); }
|
||||
else if (c == '&') { builder.Append("&"); }
|
||||
else if (c == '\"') { builder.Append("""); }
|
||||
else { WriteScalarAsHtmlEncodedEntity(builder, (uint)nextScalar); }
|
||||
encoder = new HtmlUnicodeEncoder(new[] { CodePointFilters.BasicLatin });
|
||||
Volatile.Write(ref _basicLatinSingleton, encoder);
|
||||
}
|
||||
return encoder;
|
||||
}
|
||||
}
|
||||
|
||||
return builder.ToString();
|
||||
}
|
||||
|
||||
// Determines whether the given character can be returned unencoded.
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
private bool IsCharacterAllowed(char c)
|
||||
{
|
||||
uint codePoint = (uint)c;
|
||||
int index = (int)(codePoint >> 5);
|
||||
int offset = (int)(codePoint & 0x1FU);
|
||||
return ((_allowedCharsBitmap[index] >> offset) & 0x1U) != 0;
|
||||
}
|
||||
|
||||
// Writes a scalar value as "�"
|
||||
private static void WriteScalarAsHtmlEncodedEntity(StringBuilder builder, uint value)
|
||||
{
|
||||
// We're building the characters up in reverse
|
||||
char* chars = stackalloc char[8 /* "FFFFFFFF" */];
|
||||
int numCharsWritten = 0;
|
||||
do
|
||||
// Writes a scalar value as an HTML-encoded entity.
|
||||
protected override void WriteEncodedScalar(StringBuilder builder, uint value)
|
||||
{
|
||||
Debug.Assert(numCharsWritten < 8, "Couldn't have written 8 characters out by this point.");
|
||||
// Pop off the last nibble
|
||||
chars[numCharsWritten++] = HexUtil.IntToChar(value & 0xFU);
|
||||
value >>= 4;
|
||||
} while (value != 0);
|
||||
if (value == (uint)'\"') { builder.Append("""); }
|
||||
else if (value == (uint)'&') { builder.Append("&"); }
|
||||
else if (value == (uint)'<') { builder.Append("<"); }
|
||||
else if (value == (uint)'>') { builder.Append(">"); }
|
||||
else { WriteEncodedScalarAsNumericEntity(builder, value); }
|
||||
}
|
||||
|
||||
// Finally, write out the HTML-encoded scalar value.
|
||||
builder.Append('&');
|
||||
builder.Append('#');
|
||||
builder.Append('x');
|
||||
Debug.Assert(numCharsWritten > 0, "At least one character should've been written.");
|
||||
do
|
||||
// Writes a scalar value as an HTML-encoded numeric entity.
|
||||
private static void WriteEncodedScalarAsNumericEntity(StringBuilder builder, uint value)
|
||||
{
|
||||
builder.Append(chars[--numCharsWritten]);
|
||||
} while (numCharsWritten != 0);
|
||||
builder.Append(';');
|
||||
// We're building the characters up in reverse
|
||||
char* chars = stackalloc char[8 /* "FFFFFFFF" */];
|
||||
int numCharsWritten = 0;
|
||||
do
|
||||
{
|
||||
Debug.Assert(numCharsWritten < 8, "Couldn't have written 8 characters out by this point.");
|
||||
// Pop off the last nibble
|
||||
chars[numCharsWritten++] = HexUtil.IntToChar(value & 0xFU);
|
||||
value >>= 4;
|
||||
} while (value != 0);
|
||||
|
||||
// Finally, write out the HTML-encoded scalar value.
|
||||
builder.Append('&');
|
||||
builder.Append('#');
|
||||
builder.Append('x');
|
||||
Debug.Assert(numCharsWritten > 0, "At least one character should've been written.");
|
||||
do
|
||||
{
|
||||
builder.Append(chars[--numCharsWritten]);
|
||||
} while (numCharsWritten != 0);
|
||||
builder.Append(';');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue