Update WebEncoders from Unicode 7.0 to Unicode 8.0

Add "how to update" file detailing update steps
2015-09-05 16:22:14 -07:00 · 2015-09-05 16:22:14 -07:00 · 43d0b0f65b
parent f3e828892d
commit 43d0b0f65b
10 changed files with 2189 additions and 123 deletions
--- a/src/Microsoft.Framework.WebEncoders.Core/UnicodeHelpers.cs
+++ b/src/Microsoft.Framework.WebEncoders.Core/UnicodeHelpers.cs
@ -28,14 +28,14 @@ namespace Microsoft.Framework.WebEncoders

        /// <summary>
        /// Helper method which creates a bitmap of all characters which are
-        /// defined per version 7.0.0 of the Unicode specification.
+        /// defined per version 8.0 of the Unicode specification.
        /// </summary>
        [MethodImpl(MethodImplOptions.NoInlining)]
        private static uint[] CreateDefinedCharacterBitmap()
        {
            // The stream should be exactly 8KB in size.
            var assembly = typeof(UnicodeHelpers).GetTypeInfo().Assembly;
-            var resourceName = assembly.GetName().Name + ".compiler.resources.unicode-7.0.0-defined-characters.bin";
+            var resourceName = assembly.GetName().Name + ".compiler.resources.unicode-defined-chars.bin";

            var stream = assembly.GetManifestResourceStream(resourceName);
            if (stream.Length != 8 * 1024)
@ -72,7 +72,7 @@ namespace Microsoft.Framework.WebEncoders
        }

        /// <summary>
-        /// Returns a bitmap of all characters which are defined per version 7.0.0
+        /// Returns a bitmap of all characters which are defined per version 8.0
        /// of the Unicode specification.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
@ -204,7 +204,7 @@ namespace Microsoft.Framework.WebEncoders
        }

        /// <summary>
-        /// Returns a value stating whether a character is defined per version 7.0.0
+        /// Returns a value stating whether a character is defined per version 8.0
        /// of the Unicode specification. Certain classes of characters (control chars,
        /// private use, surrogates, some whitespace) are considered "undefined" for
        /// our purposes.
--- a/src/Microsoft.Framework.WebEncoders.Core/UnicodeRanges.cs
+++ b/src/Microsoft.Framework.WebEncoders.Core/UnicodeRanges.cs
@ -10,7 +10,7 @@ namespace Microsoft.Framework.WebEncoders
 {
    /// <summary>
    /// Contains predefined <see cref="UnicodeRange"/> instances which correspond to blocks
-    /// from the Unicode 7.0 specification.
+    /// from the Unicode 8.0 specification.
    /// </summary>
    public static partial class UnicodeRanges
    {
--- a/src/Microsoft.Framework.WebEncoders.Core/UnicodeRanges.generated.cs
+++ b/src/Microsoft.Framework.WebEncoders.Core/UnicodeRanges.generated.cs
@ -1277,6 +1277,15 @@ namespace Microsoft.Framework.WebEncoders
        public static UnicodeRange LatinExtendedE => Volatile.Read(ref _latinExtendedE) ?? CreateRange(ref _latinExtendedE, first: '\uAB30', last: '\uAB6F');
        private static UnicodeRange _latinExtendedE;

+        /// <summary>
+        /// A <see cref="UnicodeRange"/> corresponding to the 'Cherokee Supplement' Unicode block (U+AB70..U+ABBF).
+        /// </summary>
+        /// <remarks>
+        /// See http://www.unicode.org/charts/PDF/UAB70.pdf for the full set of characters in this block.
+        /// </remarks>
+        public static UnicodeRange CherokeeSupplement => Volatile.Read(ref _cherokeeSupplement) ?? CreateRange(ref _cherokeeSupplement, first: '\uAB70', last: '\uABBF');
+        private static UnicodeRange _cherokeeSupplement;
+
        /// <summary>
        /// A <see cref="UnicodeRange"/> corresponding to the 'Meetei Mayek' Unicode block (U+ABC0..U+ABFF).
        /// </summary>
@ -1303,7 +1312,7 @@ namespace Microsoft.Framework.WebEncoders
        /// </remarks>
        public static UnicodeRange HangulJamoExtendedB => Volatile.Read(ref _hangulJamoExtendedB) ?? CreateRange(ref _hangulJamoExtendedB, first: '\uD7B0', last: '\uD7FF');
        private static UnicodeRange _hangulJamoExtendedB;
-        
+
        /// <summary>
        /// A <see cref="UnicodeRange"/> corresponding to the 'CJK Compatibility Ideographs' Unicode block (U+F900..U+FAFF).
        /// </summary>
--- a/src/Microsoft.Framework.WebEncoders.Core/compiler/resources/unicode-7.0.0-defined-characters.bin
+++ b/src/Microsoft.Framework.WebEncoders.Core/compiler/resources/unicode-7.0.0-defined-characters.bin
--- a/test/Microsoft.Framework.WebEncoders.Tests/UnicodeHelpersTests.cs
+++ b/test/Microsoft.Framework.WebEncoders.Tests/UnicodeHelpersTests.cs
@ -193,7 +193,7 @@ namespace Microsoft.Framework.WebEncoders
                }
            }

-            // Handle known spans from Unicode 7.0.0's UnicodeData.txt
+            // Handle known spans from Unicode 8.0's UnicodeData.txt

            // CJK Ideograph Extension A
            for (int i = '\u3400'; i <= '\u4DB5'; i++)
@ -201,7 +201,7 @@ namespace Microsoft.Framework.WebEncoders
                retVal[i] = true;
            }
            // CJK Ideograph
-            for (int i = '\u4E00'; i <= '\u9FCC'; i++)
+            for (int i = '\u4E00'; i <= '\u9FD5'; i++)
            {
                retVal[i] = true;
            }
--- a/test/Microsoft.Framework.WebEncoders.Tests/UnicodeRangesTests.cs
+++ b/test/Microsoft.Framework.WebEncoders.Tests/UnicodeRangesTests.cs
@ -172,6 +172,7 @@ namespace Microsoft.Framework.WebEncoders
        [InlineData('\uAAE0', '\uAAFF', nameof(UnicodeRanges.MeeteiMayekExtensions))]
        [InlineData('\uAB00', '\uAB2F', nameof(UnicodeRanges.EthiopicExtendedA))]
        [InlineData('\uAB30', '\uAB6F', nameof(UnicodeRanges.LatinExtendedE))]
+        [InlineData('\uAB70', '\uABBF', nameof(UnicodeRanges.CherokeeSupplement))]
        [InlineData('\uABC0', '\uABFF', nameof(UnicodeRanges.MeeteiMayek))]
        [InlineData('\uAC00', '\uD7AF', nameof(UnicodeRanges.HangulSyllables))]
        [InlineData('\uD7B0', '\uD7FF', nameof(UnicodeRanges.HangulJamoExtendedB))]
--- a/unicode/Blocks.txt
+++ b/unicode/Blocks.txt
@ -1,14 +1,11 @@
-# Blocks-7.0.0.txt
-# Date: 2014-04-03, 23:23:00 GMT [RP, KW]
+# Blocks-8.0.0.txt
+# Date: 2014-11-10, 23:04:00 GMT [KW]
 #
 # Unicode Character Database
 # Copyright (c) 1991-2014 Unicode, Inc.
 # For terms of use, see http://www.unicode.org/terms_of_use.html
 # For documentation, see http://www.unicode.org/reports/tr44/
 #
-# Note:   The casing of block names is not normative.
-#         For example, "Basic Latin" and "BASIC LATIN" are equivalent.
-#
 # Format:
 # Start Code..End Code; Block Name

@ -20,6 +17,14 @@
 #         For more information on the comparison of property values, 
 #            see UAX #44: http://www.unicode.org/reports/tr44/
 #
+#  All block ranges start with a value where (cp MOD 16) = 0,
+#  and end with a value where (cp MOD 16) = 15. In other words,
+#  the last hexadecimal digit of the start of range is ...0 
+#  and the last hexadecimal digit of the end of range is ...F.
+#  This constraint on block ranges guarantees that allocations
+#  are done in terms of whole columns, and that code chart display
+#  never involves splitting columns in the charts.
+#
 #  All code points not explicitly listed for Block
 #  have the value No_Block.

@ -168,6 +173,7 @@ AA80..AADF; Tai Viet
 AAE0..AAFF; Meetei Mayek Extensions
 AB00..AB2F; Ethiopic Extended-A
 AB30..AB6F; Latin Extended-E
+AB70..ABBF; Cherokee Supplement
 ABC0..ABFF; Meetei Mayek
 AC00..D7AF; Hangul Syllables
 D7B0..D7FF; Hangul Jamo Extended-B
@ -210,6 +216,7 @@ FFF0..FFFF; Specials
 10840..1085F; Imperial Aramaic
 10860..1087F; Palmyrene
 10880..108AF; Nabataean
+108E0..108FF; Hatran
 10900..1091F; Phoenician
 10920..1093F; Lydian
 10980..1099F; Meroitic Hieroglyphs
@ -223,6 +230,7 @@ FFF0..FFFF; Specials
 10B60..10B7F; Inscriptional Pahlavi
 10B80..10BAF; Psalter Pahlavi
 10C00..10C4F; Old Turkic
+10C80..10CFF; Old Hungarian
 10E60..10E7F; Rumi Numeral Symbols
 11000..1107F; Brahmi
 11080..110CF; Kaithi
@ -232,17 +240,21 @@ FFF0..FFFF; Specials
 11180..111DF; Sharada
 111E0..111FF; Sinhala Archaic Numbers
 11200..1124F; Khojki
+11280..112AF; Multani
 112B0..112FF; Khudawadi
 11300..1137F; Grantha
 11480..114DF; Tirhuta
 11580..115FF; Siddham
 11600..1165F; Modi
 11680..116CF; Takri
+11700..1173F; Ahom
 118A0..118FF; Warang Citi
 11AC0..11AFF; Pau Cin Hau
 12000..123FF; Cuneiform
 12400..1247F; Cuneiform Numbers and Punctuation
+12480..1254F; Early Dynastic Cuneiform
 13000..1342F; Egyptian Hieroglyphs
+14400..1467F; Anatolian Hieroglyphs
 16800..16A3F; Bamum Supplement
 16A40..16A6F; Mro
 16AD0..16AFF; Bassa Vah
@ -257,6 +269,7 @@ FFF0..FFFF; Specials
 1D300..1D35F; Tai Xuan Jing Symbols
 1D360..1D37F; Counting Rod Numerals
 1D400..1D7FF; Mathematical Alphanumeric Symbols
+1D800..1DAAF; Sutton SignWriting
 1E800..1E8DF; Mende Kikakui
 1EE00..1EEFF; Arabic Mathematical Alphabetic Symbols
 1F000..1F02F; Mahjong Tiles
@ -271,9 +284,11 @@ FFF0..FFFF; Specials
 1F700..1F77F; Alchemical Symbols
 1F780..1F7FF; Geometric Shapes Extended
 1F800..1F8FF; Supplemental Arrows-C
+1F900..1F9FF; Supplemental Symbols and Pictographs
 20000..2A6DF; CJK Unified Ideographs Extension B
 2A700..2B73F; CJK Unified Ideographs Extension C
 2B740..2B81F; CJK Unified Ideographs Extension D
+2B820..2CEAF; CJK Unified Ideographs Extension E
 2F800..2FA1F; CJK Compatibility Ideographs Supplement
 E0000..E007F; Tags
 E0100..E01EF; Variation Selectors Supplement
--- a/unicode/UnicodeData.txt
+++ b/unicode/UnicodeData.txt
--- a/unicode/how-to-update.txt
+++ b/unicode/how-to-update.txt
@ -0,0 +1,94 @@
+This document contains instructions for updating the Unicode data set used by
+the WebEncoders project.
+
+1) Download the latest UnicodeData.txt and Blocks.txt from the Unicode
+   Consortium web site. These files are normally found under
+   http://www.unicode.org/Public/X.Y.Z/ucd/, where X.Y.Z is the version of the
+   Unicode specification of interest. Replace the UnicodeData.txt and
+   Blocks.txt files in this folder with the files you downloaded.
+
+2) Update unicode-copyright.txt in this folder with the following information:
+   - The exact URLs where you downloaded UnicodeData.txt and Blocks.txt.
+   - The date on which you downloaded these two files.
+   - The Unicode copyright and permission notice, if it has changed. The latest
+     copyright and permission notice can be found at the bottom of
+     http://www.unicode.org/copyright.html.
+
+3) Open the Generators solution and run the DefinedCharListGenerator project.
+   Running this will drop a file unicode-defined-chars.bin into the output
+   folder. Move this file into the following directory, overwriting the
+   existing file in that directory:
+   src\Microsoft.Framework.WebEncoders.Core\compiler\resources
+
+4) Open the Generators solution and run the UnicodeTablesGenerator project.
+   Running this will drop two files UnicodeRanges.generated.txt and
+   UnicodeRangesTests.generated.txt into the output folder.
+
+5) Open UnicodeRanges.generated.txt in your favorite text editor. You'll see
+   that the file contains all of the parsed Unicode block information in
+   ascending code point order. Manually REMOVE the following blocks from this
+   text file and re-save it.
+   - High Surrogates (U+D800..U+DB7F)
+   - High Private Use Surrogates (U+DB80..U+DBFF)
+   - Low Surrogates (U+DC00..U+DFFF)
+   - Private Use Area (U+E000..U+F8FF)
+
+6) Open src\Microsoft.Framework.WebEncoders.Core\UnicodeRanges.generated.cs in
+   your IDE. Delete everything within the partial class definition and replace
+   it with the contents of UnicodeRanges.generated.txt. (Remember to remove
+   the blocks mentioned in the previous step, otherwise unit tests will fail.)
+
+   Open src\Microsoft.Framework.WebEncoders.Core\UnicodeRanges.cs in your IDE.
+   Update the doc comment at the top of the class to reflect the appropriate
+   version of the Unicode specification.
+
+7) Open UnicodeRangesTests.generated.txt in your favorite text editor. Just
+   like in the previous .txt file, you'll need to remove the [InlineData]
+   lines which map to the Unicode blocks which were manually removed.
+   See step (5) for the list of which blocks must be removed. Then re-save
+   this file.
+
+8) Open test\Microsoft.Framework.WebEncoders.Tests\UnicodeRangesTests.cs in
+   your IDE. Delete all of the [InlineData] attributes on the Range_Unicode
+   test, then paste in the contents of UnicodeRangesTests.generated.txt to
+   supply the new [InlineData] list.
+
+   IMPORTANT: Don't delete the [Theory] attribute on this method!
+
+9) Open test\Microsoft.Framework.WebEncoders.Tests\UnicodeHelpersTests.cs in
+   your IDE. Scroll to the bottom of the ReadListOfDefinedCharacters method,
+   and you'll see a section where the test special-cases CJK Ideographs and
+   Hangul Syllables. As more characters are added to the Unicode specification
+   the list of valid CJK Ideographs and Hangul Syllables can grow, so make sure
+   these match up with the relevant lines in UnicodeData.txt. For instance, at
+   the time of this writing UnicodeData.txt lists the valid Hangul Syllable
+   character range as follows:
+
+     AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
+     D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
+
+   If necessary, update the logic in the ReadListOfDefinedCharacters method to
+   account for any changes to these lines in UnicodeData.txt.
+
+That's it! Run the unit tests and everything should be good to go. If you find
+any stray comments throughout the code base that reference a specific version
+of the Unicode specification, go ahead and update them so that they correctly
+reflect the version you just updated to.
+
+To recap, the files you should check in are:
+
+src\Microsoft.Framework.WebEncoders.Core\compiler\resources\
+  unicode-defined-chars.bin
+
+src\Microsoft.Framework.WebEncoders.Core\
+  UnicodeRanges.cs
+  UnicodeRanges.generated.cs
+
+test\Microsoft.Framework.WebEncoders.Tests\
+  UnicodeHelpersTests.cs (if necessary, see step 9)
+  UnicodeRangesTests.cs
+
+unicode\
+  Blocks.txt
+  unicode-copyright.txt
+  UnicodeData.txt
--- a/unicode/unicode-copyright.txt
+++ b/unicode/unicode-copyright.txt
@ -1,8 +1,8 @@
 The files Blocks.txt and UnicodeData.txt in this directory were
-retrieved from the following URLs on Saturday, February 7, 2015.
+retrieved from the following URLs on Saturday, September 5, 2015.

-http://www.unicode.org/Public/7.0.0/ucd/Blocks.txt
-http://www.unicode.org/Public/7.0.0/ucd/UnicodeData.txt
+http://www.unicode.org/Public/8.0.0/ucd/Blocks.txt
+http://www.unicode.org/Public/8.0.0/ucd/UnicodeData.txt

 The below copyright notice applies to these files.