Fix CJK Ideographs and Hangul Syllables representation

Characters in these blocks weren't correctly identified as assigned characters, which caused the encoders to always encode them, even if the ranges were in the allow list.
This commit is contained in:
Levi B 2015-04-01 11:45:52 -07:00
parent 8da763a14a
commit 6c9055cadc
3 changed files with 101 additions and 4 deletions

View File

@ -101,8 +101,7 @@ namespace Microsoft.Framework.WebEncoders
bool actual = UnicodeHelpers.IsCharacterDefined((char)i);
if (expected != actual)
{
string message = String.Format(CultureInfo.InvariantCulture, "Character U+{0:X4}: expected = {1}, actual = {2}", i, expected, actual);
errors.Add(message);
errors.Add($"Character U+{i:X4}: expected = {expected}, actual = {actual}");
}
}
@ -164,11 +163,18 @@ namespace Microsoft.Framework.WebEncoders
{
string[] splitLine = line.Split(';');
uint codePoint = UInt32.Parse(splitLine[0], NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture);
string name = splitLine[1];
if (codePoint >= retVal.Length)
{
continue; // don't care about supplementary chars
}
if (name.EndsWith(", First>", StringComparison.Ordinal) || name.EndsWith(", Last>", StringComparison.Ordinal))
{
// ignore spans - we'll handle them separately
continue;
}
if (codePoint == (uint)' ')
{
retVal[codePoint] = true; // we allow U+0020 SPACE as our only valid Zs (whitespace) char
@ -184,6 +190,24 @@ namespace Microsoft.Framework.WebEncoders
}
}
// Handle known spans from Unicode 7.0.0's UnicodeData.txt
// CJK Ideograph Extension A
for (int i = '\u3400'; i <= '\u4DB5'; i++)
{
retVal[i] = true;
}
// CJK Ideograph
for (int i = '\u4E00'; i <= '\u9FCC'; i++)
{
retVal[i] = true;
}
// Hangul Syllable
for (int i = '\uAC00'; i <= '\uD7A3'; i++)
{
retVal[i] = true;
}
// Finally, we need to make sure we've seen every category which contains
// allowed characters. This provides extra defense against having a typo
// in the list of categories.

View File

@ -1,4 +1,5 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.IO;
@ -19,6 +20,7 @@ namespace DefinedCharListGenerator
const uint MAX_UNICODE_CHAR = 0x10FFFF; // Unicode range is U+0000 .. U+10FFFF
bool[] definedChars = new bool[MAX_UNICODE_CHAR + 1];
Dictionary<string, Span> spans = new Dictionary<string, Span>();
// Read all defined characters from the input file.
string[] allLines = File.ReadAllLines("UnicodeData.txt");
@ -28,11 +30,33 @@ namespace DefinedCharListGenerator
foreach (string line in allLines)
{
string[] splitLine = line.Split(new char[] { ';' }, 4);
uint codepoint = uint.Parse(splitLine[0], NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture);
string rawName = splitLine[1];
string category = splitLine[2];
// spans go into their own dictionary for later processing
string spanName;
bool isStartOfSpan;
if (IsSpanDefinition(rawName, out spanName, out isStartOfSpan))
{
if (isStartOfSpan)
{
spans.Add(spanName, new Span() { FirstCodePoint = codepoint, Category = category });
}
else
{
var existingSpan = spans[spanName];
Debug.Assert(existingSpan.FirstCodePoint != 0, "We should've seen the start of this span already.");
Debug.Assert(existingSpan.LastCodePoint == 0, "We shouldn't have seen the end of this span already.");
Debug.Assert(existingSpan.Category == category, "Span start Unicode category doesn't match span end Unicode category.");
existingSpan.LastCodePoint = codepoint;
}
continue;
}
// We only allow certain categories of code points.
// Zs (space separators) aren't included, but we allow U+0020 SPACE as a special case
uint codepoint = uint.Parse(splitLine[0], NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture);
string category = splitLine[2];
if (!(codepoint == (uint)' ' || IsAllowedUnicodeCategory(category)))
{
continue;
@ -42,6 +66,21 @@ namespace DefinedCharListGenerator
definedChars[codepoint] = true;
}
// Next, populate characters that weren't defined on their own lines
// but which are instead defined as members of a named span.
foreach (var span in spans.Values)
{
if (IsAllowedUnicodeCategory(span.Category))
{
Debug.Assert(span.FirstCodePoint <= MAX_UNICODE_CHAR);
Debug.Assert(span.LastCodePoint <= MAX_UNICODE_CHAR);
for (uint i = span.FirstCodePoint; i <= span.LastCodePoint; i++)
{
definedChars[i] = true;
}
}
}
// Finally, write the list of defined characters out as a bitmap.
// Each consecutive block of 8 chars is written as a single byte.
// For instance, the first byte of the output file contains the
@ -103,5 +142,39 @@ namespace DefinedCharListGenerator
|| category == "So"
|| category == "Cf"; /* other */
}
/// <summary>
/// Determines whether a UnicodeData.txt name field marks one end of a
/// code point span, e.g. the two lines
///   DC00;&lt;Low Surrogate, First&gt;;Cs;0;L;;;;;N;;;;;
///   DFFF;&lt;Low Surrogate, Last&gt;;Cs;0;L;;;;;N;;;;;
/// </summary>
/// <param name="rawName">The name field exactly as it appears in the data file.</param>
/// <param name="spanName">
/// On success, the text between the leading '&lt;' and the ", First&gt;" /
/// ", Last&gt;" suffix (for example "Low Surrogate"); otherwise null.
/// </param>
/// <param name="isStartOfSpan">
/// On success, true for a ", First&gt;" marker and false for a ", Last&gt;"
/// marker; always false when the method returns false.
/// </param>
/// <returns>true if <paramref name="rawName"/> is a span marker; otherwise false.</returns>
private static bool IsSpanDefinition(string rawName, out string spanName, out bool isStartOfSpan)
{
    const string FirstSuffix = ", First>";
    const string LastSuffix = ", Last>";

    // Assume "not a span" until proven otherwise.
    spanName = null;
    isStartOfSpan = false;

    // Names that aren't wrapped in angle brackets are ordinary character names.
    if (!rawName.StartsWith("<", StringComparison.Ordinal))
    {
        return false;
    }

    string matchedSuffix;
    if (rawName.EndsWith(FirstSuffix, StringComparison.Ordinal))
    {
        matchedSuffix = FirstSuffix;
        isStartOfSpan = true;
    }
    else if (rawName.EndsWith(LastSuffix, StringComparison.Ordinal))
    {
        matchedSuffix = LastSuffix;
    }
    else
    {
        // Bracketed but not a span marker, e.g. <control>.
        return false;
    }

    // Drop the leading '<' (one char) and the trailing suffix.
    spanName = rawName.Substring(1, rawName.Length - matchedSuffix.Length - 1);
    return true;
}
/// <summary>
/// Mutable record of a code point span declared in UnicodeData.txt by a
/// pair of "&lt;Name, First&gt;" / "&lt;Name, Last&gt;" lines.
/// </summary>
private class Span
{
    /// <summary>Unicode general category field shared by the span's marker lines.</summary>
    public string Category;

    /// <summary>Code point taken from the span's ", First&gt;" line.</summary>
    public uint FirstCodePoint;

    /// <summary>
    /// Code point taken from the span's ", Last&gt;" line; remains 0 until
    /// that line has been read.
    /// </summary>
    public uint LastCodePoint;
}
}
}