diff --git a/src/Microsoft.Framework.WebEncoders.Core/compiler/resources/unicode-7.0.0-defined-characters.bin b/src/Microsoft.Framework.WebEncoders.Core/compiler/resources/unicode-7.0.0-defined-characters.bin
index c9b36c871d..06530d73cf 100644
Binary files a/src/Microsoft.Framework.WebEncoders.Core/compiler/resources/unicode-7.0.0-defined-characters.bin and b/src/Microsoft.Framework.WebEncoders.Core/compiler/resources/unicode-7.0.0-defined-characters.bin differ
diff --git a/test/Microsoft.Framework.WebEncoders.Tests/UnicodeHelpersTests.cs b/test/Microsoft.Framework.WebEncoders.Tests/UnicodeHelpersTests.cs
index 1984a61a53..6be7c93bbc 100644
--- a/test/Microsoft.Framework.WebEncoders.Tests/UnicodeHelpersTests.cs
+++ b/test/Microsoft.Framework.WebEncoders.Tests/UnicodeHelpersTests.cs
@@ -101,8 +101,7 @@ namespace Microsoft.Framework.WebEncoders
                 bool actual = UnicodeHelpers.IsCharacterDefined((char)i);
                 if (expected != actual)
                 {
-                    string message = String.Format(CultureInfo.InvariantCulture, "Character U+{0:X4}: expected = {1}, actual = {2}", i, expected, actual);
-                    errors.Add(message);
+                    errors.Add($"Character U+{i:X4}: expected = {expected}, actual = {actual}");
                 }
             }
 
@@ -164,11 +163,18 @@ namespace Microsoft.Framework.WebEncoders
             {
                 string[] splitLine = line.Split(';');
                 uint codePoint = UInt32.Parse(splitLine[0], NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture);
+                string name = splitLine[1];
                 if (codePoint >= retVal.Length)
                 {
                     continue; // don't care about supplementary chars
                 }
 
+                if (name.EndsWith(", First>", StringComparison.Ordinal) || name.EndsWith(", Last>", StringComparison.Ordinal))
+                {
+                    // ignore spans - we'll handle them separately
+                    continue;
+                }
+
                 if (codePoint == (uint)' ')
                 {
                     retVal[codePoint] = true; // we allow U+0020 SPACE as our only valid Zs (whitespace) char
@@ -184,6 +190,24 @@ namespace Microsoft.Framework.WebEncoders
                 }
             }
 
+            // Handle known spans from Unicode 7.0.0's UnicodeData.txt
+
+            // CJK Ideograph Extension A
+            for (int i = '\u3400'; i <= '\u4DB5'; i++)
+            {
+                retVal[i] = true;
+            }
+            // CJK Ideograph
+            for (int i = '\u4E00'; i <= '\u9FCC'; i++)
+            {
+                retVal[i] = true;
+            }
+            // Hangul Syllable
+            for (int i = '\uAC00'; i <= '\uD7A3'; i++)
+            {
+                retVal[i] = true;
+            }
+
             // Finally, we need to make sure we've seen every category which contains
             // allowed characters. This provides extra defense against having a typo
             // in the list of categories.
diff --git a/unicode/Generators/DefinedCharListGenerator/Program.cs b/unicode/Generators/DefinedCharListGenerator/Program.cs
index f98dd32f5b..edcd0f3602 100644
--- a/unicode/Generators/DefinedCharListGenerator/Program.cs
+++ b/unicode/Generators/DefinedCharListGenerator/Program.cs
@@ -1,4 +1,5 @@
 using System;
+using System.Collections.Generic;
 using System.Diagnostics;
 using System.Globalization;
 using System.IO;
@@ -19,6 +20,7 @@ namespace DefinedCharListGenerator
             const uint MAX_UNICODE_CHAR = 0x10FFFF; // Unicode range is U+0000 .. U+10FFFF
 
             bool[] definedChars = new bool[MAX_UNICODE_CHAR + 1];
+            Dictionary<string, Span> spans = new Dictionary<string, Span>();
 
             // Read all defined characters from the input file.
             string[] allLines = File.ReadAllLines("UnicodeData.txt");
@@ -28,11 +30,33 @@ namespace DefinedCharListGenerator
             foreach (string line in allLines)
             {
                 string[] splitLine = line.Split(new char[] { ';' }, 4);
+                uint codepoint = uint.Parse(splitLine[0], NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture);
+                string rawName = splitLine[1];
+                string category = splitLine[2];
+
+                // spans go into their own dictionary for later processing
+                string spanName;
+                bool isStartOfSpan;
+                if (IsSpanDefinition(rawName, out spanName, out isStartOfSpan))
+                {
+                    if (isStartOfSpan)
+                    {
+                        spans.Add(spanName, new Span() { FirstCodePoint = codepoint, Category = category });
+                    }
+                    else
+                    {
+                        var existingSpan = spans[spanName];
+                        Debug.Assert(existingSpan.FirstCodePoint != 0, "We should've seen the start of this span already.");
+                        Debug.Assert(existingSpan.LastCodePoint == 0, "We shouldn't have seen the end of this span already.");
+                        Debug.Assert(existingSpan.Category == category, "Span start Unicode category doesn't match span end Unicode category.");
+                        existingSpan.LastCodePoint = codepoint;
+                    }
+                    continue;
+                }
                 // We only allow certain categories of code points.
                 // Zs (space separators) aren't included, but we allow U+0020 SPACE as a special case
-                uint codepoint = uint.Parse(splitLine[0], NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture);
-                string category = splitLine[2];
+
                 if (!(codepoint == (uint)' ' || IsAllowedUnicodeCategory(category)))
                 {
                     continue;
                 }
@@ -42,6 +66,21 @@ namespace DefinedCharListGenerator
                 definedChars[codepoint] = true;
             }
 
+            // Next, populate characters that weren't defined on their own lines
+            // but which are instead defined as members of a named span.
+            foreach (var span in spans.Values)
+            {
+                if (IsAllowedUnicodeCategory(span.Category))
+                {
+                    Debug.Assert(span.FirstCodePoint <= MAX_UNICODE_CHAR);
+                    Debug.Assert(span.LastCodePoint <= MAX_UNICODE_CHAR);
+                    for (uint i = span.FirstCodePoint; i <= span.LastCodePoint; i++)
+                    {
+                        definedChars[i] = true;
+                    }
+                }
+            }
+
             // Finally, write the list of defined characters out as a bitmap.
             // Each consecutive block of 8 chars is written as a single byte.
             // For instance, the first byte of the output file contains the
@@ -103,5 +142,39 @@ namespace DefinedCharListGenerator
                 || category == "So"
                 || category == "Cf"; /* other */
         }
+
+        private static bool IsSpanDefinition(string rawName, out string spanName, out bool isStartOfSpan)
+        {
+            // Spans are represented within angle brackets, such as the following:
+            // DC00;<Low Surrogate, First>;Cs;0;L;;;;;N;;;;;
+            // DFFF;<Low Surrogate, Last>;Cs;0;L;;;;;N;;;;;
+            if (rawName.StartsWith("<", StringComparison.Ordinal))
+            {
+                if (rawName.EndsWith(", First>", StringComparison.Ordinal))
+                {
+                    spanName = rawName.Substring(1, rawName.Length - 1 - ", First>".Length);
+                    isStartOfSpan = true;
+                    return true;
+                }
+                else if (rawName.EndsWith(", Last>", StringComparison.Ordinal))
+                {
+                    spanName = rawName.Substring(1, rawName.Length - 1 - ", Last>".Length);
+                    isStartOfSpan = false;
+                    return true;
+                }
+            }
+
+            // not surrounded by <>, or <control> or some other non-span
+            spanName = null;
+            isStartOfSpan = false;
+            return false;
+        }
+
+        private class Span
+        {
+            public uint FirstCodePoint;
+            public uint LastCodePoint;
+            public string Category;
+        }
     }
 }