aspnetcore/src/Microsoft.Framework.WebEnco.../UnicodeEncoderBase.cs

302 lines
12 KiB
C#

// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
using System;
using System.Diagnostics;
using System.IO;
using System.Runtime.CompilerServices;
using System.Text;
namespace Microsoft.Framework.WebEncoders
{
internal unsafe abstract class UnicodeEncoderBase
{
// A bitmap of characters which are allowed to be returned unescaped.
private AllowedCharsBitmap _allowedCharsBitmap;
// The worst-case number of output chars generated for any input char.
private readonly int _maxOutputCharsPerInputChar;
/// <summary>
/// Instantiates an encoder using a custom allow list of characters.
/// </summary>
protected UnicodeEncoderBase(CodePointFilter filter, int maxOutputCharsPerInputChar)
{
_maxOutputCharsPerInputChar = maxOutputCharsPerInputChar;
_allowedCharsBitmap = filter.GetAllowedCharsBitmap();
// Forbid characters that are special in HTML.
// Even though this is a common encoder used by everybody (including URL
// and JavaScript strings), it's unfortunately common for developers to
// forget to HTML-encode a string once it has been URL-encoded or
// JavaScript string-escaped, so this offers extra protection.
ForbidCharacter('<');
ForbidCharacter('>');
ForbidCharacter('&');
ForbidCharacter('\''); // can be used to escape attributes
ForbidCharacter('\"'); // can be used to escape attributes
ForbidCharacter('+'); // technically not HTML-specific, but can be used to perform UTF7-based attacks
// Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed
// (includes categories Cc, Cs, Co, Cn, Zs [except U+0020 SPACE], Zl, Zp)
_allowedCharsBitmap.ForbidUndefinedCharacters();
}
// Marks a character as forbidden (must be returned encoded)
protected void ForbidCharacter(char c)
{
_allowedCharsBitmap.ForbidCharacter(c);
}
/// <summary>
/// Entry point to the encoder.
/// </summary>
public void Encode(char[] value, int startIndex, int charCount, TextWriter output)
{
// Input checking
if (value == null)
{
throw new ArgumentNullException(nameof(value));
}
if (output == null)
{
throw new ArgumentNullException(nameof(output));
}
ValidateInputs(startIndex, charCount, actualInputLength: value.Length);
if (charCount != 0)
{
fixed (char* pChars = value)
{
int indexOfFirstCharWhichRequiresEncoding = GetIndexOfFirstCharWhichRequiresEncoding(&pChars[startIndex], charCount);
if (indexOfFirstCharWhichRequiresEncoding < 0)
{
// All chars are valid - just copy the buffer as-is.
output.Write(value, startIndex, charCount);
}
else
{
// Flush all chars which are known to be valid, then encode the remainder individually
if (indexOfFirstCharWhichRequiresEncoding > 0)
{
output.Write(value, startIndex, indexOfFirstCharWhichRequiresEncoding);
}
EncodeCore(&pChars[startIndex + indexOfFirstCharWhichRequiresEncoding], (uint)(charCount - indexOfFirstCharWhichRequiresEncoding), output);
}
}
}
}
/// <summary>
/// Entry point to the encoder.
/// </summary>
public string Encode(string value)
{
if (String.IsNullOrEmpty(value))
{
return value;
}
// Quick check: does the string need to be encoded at all?
// If not, just return the input string as-is.
for (int i = 0; i < value.Length; i++)
{
if (!IsCharacterAllowed(value[i]))
{
return EncodeCore(value, idxOfFirstCharWhichRequiresEncoding: i);
}
}
return value;
}
/// <summary>
/// Entry point to the encoder.
/// </summary>
public void Encode(string value, int startIndex, int charCount, TextWriter output)
{
// Input checking
if (value == null)
{
throw new ArgumentNullException(nameof(value));
}
if (output == null)
{
throw new ArgumentNullException(nameof(output));
}
ValidateInputs(startIndex, charCount, actualInputLength: value.Length);
if (charCount != 0)
{
fixed (char* pChars = value)
{
if (charCount == value.Length)
{
// Optimize for the common case: we're being asked to encode the entire input string
// (not just a subset). If all characters are safe, we can just spit it out as-is.
int indexOfFirstCharWhichRequiresEncoding = GetIndexOfFirstCharWhichRequiresEncoding(pChars, charCount);
if (indexOfFirstCharWhichRequiresEncoding < 0)
{
output.Write(value);
}
else
{
// Flush all chars which are known to be valid, then encode the remainder individually
for (int i = 0; i < indexOfFirstCharWhichRequiresEncoding; i++)
{
output.Write(pChars[i]);
}
EncodeCore(&pChars[indexOfFirstCharWhichRequiresEncoding], (uint)(charCount - indexOfFirstCharWhichRequiresEncoding), output);
}
}
else
{
// We're being asked to encode a subset, so we need to go through the slow path of appending
// each character individually.
EncodeCore(&pChars[startIndex], (uint)charCount, output);
}
}
}
}
private string EncodeCore(string input, int idxOfFirstCharWhichRequiresEncoding)
{
Debug.Assert(idxOfFirstCharWhichRequiresEncoding >= 0);
Debug.Assert(idxOfFirstCharWhichRequiresEncoding < input.Length);
int numCharsWhichMayRequireEncoding = input.Length - idxOfFirstCharWhichRequiresEncoding;
int sbCapacity = checked(idxOfFirstCharWhichRequiresEncoding + EncoderCommon.GetCapacityOfOutputStringBuilder(numCharsWhichMayRequireEncoding, _maxOutputCharsPerInputChar));
Debug.Assert(sbCapacity >= input.Length);
// Allocate the StringBuilder with the first (known to not require encoding) part of the input string,
// then begin encoding from the last (potentially requiring encoding) part of the input string.
StringBuilder builder = new StringBuilder(input, 0, idxOfFirstCharWhichRequiresEncoding, sbCapacity);
Writer writer = new Writer(builder);
fixed (char* pInput = input)
{
EncodeCore(ref writer, &pInput[idxOfFirstCharWhichRequiresEncoding], (uint)numCharsWhichMayRequireEncoding);
}
return builder.ToString();
}
private void EncodeCore(char* input, uint charsRemaining, TextWriter output)
{
Writer writer = new Writer(output);
EncodeCore(ref writer, input, charsRemaining);
}
private void EncodeCore(ref Writer writer, char* input, uint charsRemaining)
{
while (charsRemaining != 0)
{
int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(input, endOfString: (charsRemaining == 1));
if (UnicodeHelpers.IsSupplementaryCodePoint(nextScalar))
{
// Supplementary characters should always be encoded numerically.
WriteEncodedScalar(ref writer, (uint)nextScalar);
// We consume two UTF-16 characters for a single supplementary character.
input += 2;
charsRemaining -= 2;
}
else
{
// Otherwise, this was a BMP character.
input++;
charsRemaining--;
char c = (char)nextScalar;
if (IsCharacterAllowed(c))
{
writer.Write(c);
}
else
{
WriteEncodedScalar(ref writer, (uint)nextScalar);
}
}
}
}
private int GetIndexOfFirstCharWhichRequiresEncoding(char* input, int inputLength)
{
for (int i = 0; i < inputLength; i++)
{
if (!IsCharacterAllowed(input[i]))
{
return i;
}
}
return -1; // no characters require encoding
}
// Determines whether the given character can be returned unencoded.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool IsCharacterAllowed(char c)
{
return _allowedCharsBitmap.IsCharacterAllowed(c);
}
private static void ValidateInputs(int startIndex, int charCount, int actualInputLength)
{
if (startIndex < 0 || startIndex > actualInputLength)
{
throw new ArgumentOutOfRangeException(nameof(startIndex));
}
if (charCount < 0 || charCount > (actualInputLength - startIndex))
{
throw new ArgumentOutOfRangeException(nameof(charCount));
}
}
protected abstract void WriteEncodedScalar(ref Writer writer, uint value);
/// <summary>
/// Provides an abstraction over both StringBuilder and TextWriter.
/// Declared as a struct so we can allocate on the stack and pass by
/// reference. Eliminates chatty virtual dispatches on hot paths.
/// </summary>
protected struct Writer
{
private readonly StringBuilder _innerBuilder;
private readonly TextWriter _innerWriter;
public Writer(StringBuilder innerBuilder)
{
_innerBuilder = innerBuilder;
_innerWriter = null;
}
public Writer(TextWriter innerWriter)
{
_innerBuilder = null;
_innerWriter = innerWriter;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Write(char value)
{
if (_innerBuilder != null)
{
_innerBuilder.Append(value);
}
else
{
_innerWriter.Write(value);
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Write(string value)
{
if (_innerBuilder != null)
{
_innerBuilder.Append(value);
}
else
{
_innerWriter.Write(value);
}
}
}
}
}