Update WebEncoders from Unicode 7.0 to Unicode 8.0

Add "how to update" file detailing update steps
This commit is contained in:
Levi B 2015-09-05 16:22:14 -07:00
parent f3e828892d
commit 43d0b0f65b
10 changed files with 2189 additions and 123 deletions

View File

@ -28,14 +28,14 @@ namespace Microsoft.Framework.WebEncoders
/// <summary>
/// Helper method which creates a bitmap of all characters which are
/// defined per version 7.0.0 of the Unicode specification.
/// defined per version 8.0 of the Unicode specification.
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
private static uint[] CreateDefinedCharacterBitmap()
{
// The stream should be exactly 8KB in size.
var assembly = typeof(UnicodeHelpers).GetTypeInfo().Assembly;
var resourceName = assembly.GetName().Name + ".compiler.resources.unicode-7.0.0-defined-characters.bin";
var resourceName = assembly.GetName().Name + ".compiler.resources.unicode-defined-chars.bin";
var stream = assembly.GetManifestResourceStream(resourceName);
if (stream.Length != 8 * 1024)
@ -72,7 +72,7 @@ namespace Microsoft.Framework.WebEncoders
}
/// <summary>
/// Returns a bitmap of all characters which are defined per version 7.0.0
/// Returns a bitmap of all characters which are defined per version 8.0
/// of the Unicode specification.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@ -204,7 +204,7 @@ namespace Microsoft.Framework.WebEncoders
}
/// <summary>
/// Returns a value stating whether a character is defined per version 7.0.0
/// Returns a value stating whether a character is defined per version 8.0
/// of the Unicode specification. Certain classes of characters (control chars,
/// private use, surrogates, some whitespace) are considered "undefined" for
/// our purposes.

View File

@ -10,7 +10,7 @@ namespace Microsoft.Framework.WebEncoders
{
/// <summary>
/// Contains predefined <see cref="UnicodeRange"/> instances which correspond to blocks
/// from the Unicode 7.0 specification.
/// from the Unicode 8.0 specification.
/// </summary>
public static partial class UnicodeRanges
{

View File

@ -1277,6 +1277,15 @@ namespace Microsoft.Framework.WebEncoders
public static UnicodeRange LatinExtendedE => Volatile.Read(ref _latinExtendedE) ?? CreateRange(ref _latinExtendedE, first: '\uAB30', last: '\uAB6F');
private static UnicodeRange _latinExtendedE;
/// <summary>
/// Gets the <see cref="UnicodeRange"/> that covers the 'Cherokee Supplement' Unicode block (U+AB70..U+ABBF).
/// </summary>
/// <remarks>
/// The code chart at http://www.unicode.org/charts/PDF/UAB70.pdf lists every character in this block.
/// </remarks>
public static UnicodeRange CherokeeSupplement
{
    get
    {
        // Return the cached instance when the backing field has already been populated.
        UnicodeRange cached = Volatile.Read(ref _cherokeeSupplement);
        if (cached != null)
        {
            return cached;
        }

        // Otherwise hand the backing field to CreateRange by reference and return its result.
        return CreateRange(ref _cherokeeSupplement, first: '\uAB70', last: '\uABBF');
    }
}
private static UnicodeRange _cherokeeSupplement;
/// <summary>
/// A <see cref="UnicodeRange"/> corresponding to the 'Meetei Mayek' Unicode block (U+ABC0..U+ABFF).
/// </summary>
@ -1303,7 +1312,7 @@ namespace Microsoft.Framework.WebEncoders
/// </remarks>
public static UnicodeRange HangulJamoExtendedB => Volatile.Read(ref _hangulJamoExtendedB) ?? CreateRange(ref _hangulJamoExtendedB, first: '\uD7B0', last: '\uD7FF');
private static UnicodeRange _hangulJamoExtendedB;
/// <summary>
/// A <see cref="UnicodeRange"/> corresponding to the 'CJK Compatibility Ideographs' Unicode block (U+F900..U+FAFF).
/// </summary>

View File

@ -193,7 +193,7 @@ namespace Microsoft.Framework.WebEncoders
}
}
// Handle known spans from Unicode 7.0.0's UnicodeData.txt
// Handle known spans from Unicode 8.0's UnicodeData.txt
// CJK Ideograph Extension A
for (int i = '\u3400'; i <= '\u4DB5'; i++)
@ -201,7 +201,7 @@ namespace Microsoft.Framework.WebEncoders
retVal[i] = true;
}
// CJK Ideograph
for (int i = '\u4E00'; i <= '\u9FCC'; i++)
for (int i = '\u4E00'; i <= '\u9FD5'; i++)
{
retVal[i] = true;
}

View File

@ -172,6 +172,7 @@ namespace Microsoft.Framework.WebEncoders
[InlineData('\uAAE0', '\uAAFF', nameof(UnicodeRanges.MeeteiMayekExtensions))]
[InlineData('\uAB00', '\uAB2F', nameof(UnicodeRanges.EthiopicExtendedA))]
[InlineData('\uAB30', '\uAB6F', nameof(UnicodeRanges.LatinExtendedE))]
[InlineData('\uAB70', '\uABBF', nameof(UnicodeRanges.CherokeeSupplement))]
[InlineData('\uABC0', '\uABFF', nameof(UnicodeRanges.MeeteiMayek))]
[InlineData('\uAC00', '\uD7AF', nameof(UnicodeRanges.HangulSyllables))]
[InlineData('\uD7B0', '\uD7FF', nameof(UnicodeRanges.HangulJamoExtendedB))]

View File

@ -1,14 +1,11 @@
# Blocks-7.0.0.txt
# Date: 2014-04-03, 23:23:00 GMT [RP, KW]
# Blocks-8.0.0.txt
# Date: 2014-11-10, 23:04:00 GMT [KW]
#
# Unicode Character Database
# Copyright (c) 1991-2014 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
# Note: The casing of block names is not normative.
# For example, "Basic Latin" and "BASIC LATIN" are equivalent.
#
# Format:
# Start Code..End Code; Block Name
@ -20,6 +17,14 @@
# For more information on the comparison of property values,
# see UAX #44: http://www.unicode.org/reports/tr44/
#
# All block ranges start with a value where (cp MOD 16) = 0,
# and end with a value where (cp MOD 16) = 15. In other words,
# the last hexadecimal digit of the start of range is ...0
# and the last hexadecimal digit of the end of range is ...F.
# This constraint on block ranges guarantees that allocations
# are done in terms of whole columns, and that code chart display
# never involves splitting columns in the charts.
#
# All code points not explicitly listed for Block
# have the value No_Block.
@ -168,6 +173,7 @@ AA80..AADF; Tai Viet
AAE0..AAFF; Meetei Mayek Extensions
AB00..AB2F; Ethiopic Extended-A
AB30..AB6F; Latin Extended-E
AB70..ABBF; Cherokee Supplement
ABC0..ABFF; Meetei Mayek
AC00..D7AF; Hangul Syllables
D7B0..D7FF; Hangul Jamo Extended-B
@ -210,6 +216,7 @@ FFF0..FFFF; Specials
10840..1085F; Imperial Aramaic
10860..1087F; Palmyrene
10880..108AF; Nabataean
108E0..108FF; Hatran
10900..1091F; Phoenician
10920..1093F; Lydian
10980..1099F; Meroitic Hieroglyphs
@ -223,6 +230,7 @@ FFF0..FFFF; Specials
10B60..10B7F; Inscriptional Pahlavi
10B80..10BAF; Psalter Pahlavi
10C00..10C4F; Old Turkic
10C80..10CFF; Old Hungarian
10E60..10E7F; Rumi Numeral Symbols
11000..1107F; Brahmi
11080..110CF; Kaithi
@ -232,17 +240,21 @@ FFF0..FFFF; Specials
11180..111DF; Sharada
111E0..111FF; Sinhala Archaic Numbers
11200..1124F; Khojki
11280..112AF; Multani
112B0..112FF; Khudawadi
11300..1137F; Grantha
11480..114DF; Tirhuta
11580..115FF; Siddham
11600..1165F; Modi
11680..116CF; Takri
11700..1173F; Ahom
118A0..118FF; Warang Citi
11AC0..11AFF; Pau Cin Hau
12000..123FF; Cuneiform
12400..1247F; Cuneiform Numbers and Punctuation
12480..1254F; Early Dynastic Cuneiform
13000..1342F; Egyptian Hieroglyphs
14400..1467F; Anatolian Hieroglyphs
16800..16A3F; Bamum Supplement
16A40..16A6F; Mro
16AD0..16AFF; Bassa Vah
@ -257,6 +269,7 @@ FFF0..FFFF; Specials
1D300..1D35F; Tai Xuan Jing Symbols
1D360..1D37F; Counting Rod Numerals
1D400..1D7FF; Mathematical Alphanumeric Symbols
1D800..1DAAF; Sutton SignWriting
1E800..1E8DF; Mende Kikakui
1EE00..1EEFF; Arabic Mathematical Alphabetic Symbols
1F000..1F02F; Mahjong Tiles
@ -271,9 +284,11 @@ FFF0..FFFF; Specials
1F700..1F77F; Alchemical Symbols
1F780..1F7FF; Geometric Shapes Extended
1F800..1F8FF; Supplemental Arrows-C
1F900..1F9FF; Supplemental Symbols and Pictographs
20000..2A6DF; CJK Unified Ideographs Extension B
2A700..2B73F; CJK Unified Ideographs Extension C
2B740..2B81F; CJK Unified Ideographs Extension D
2B820..2CEAF; CJK Unified Ideographs Extension E
2F800..2FA1F; CJK Compatibility Ideographs Supplement
E0000..E007F; Tags
E0100..E01EF; Variation Selectors Supplement

File diff suppressed because it is too large Load Diff

94
unicode/how-to-update.txt Normal file
View File

@ -0,0 +1,94 @@
This document contains instructions for updating the Unicode data set used by
the WebEncoders project.
1) Download the latest UnicodeData.txt and Blocks.txt from the Unicode
Consortium web site. These files are normally found under
http://www.unicode.org/Public/X.Y.Z/ucd/, where X.Y.Z is the version of the
Unicode specification of interest. Replace the UnicodeData.txt and
Blocks.txt files in this folder with the files you downloaded.
2) Update unicode-copyright.txt in this folder with the following information:
- The exact URLs where you downloaded UnicodeData.txt and Blocks.txt.
- The date on which you downloaded these two files.
- The Unicode copyright and permission notice, if it has changed. The latest
copyright and permission notice can be found at the bottom of
http://www.unicode.org/copyright.html.
3) Open the Generators solution and run the DefinedCharListGenerator project.
Running this will drop a file unicode-defined-chars.bin into the output
folder. Move this file into the following directory, overwriting the
existing file in that directory:
src\Microsoft.Framework.WebEncoders.Core\compiler\resources
4) Open the Generators solution and run the UnicodeTablesGenerator project.
Running this will drop two files UnicodeRanges.generated.txt and
UnicodeRangesTests.generated.txt into the output folder.
5) Open UnicodeRanges.generated.txt in your favorite text editor. You'll see
that the file contains all of the parsed Unicode block information in
ascending code point order. Manually REMOVE the following blocks from this
text file and re-save it.
- High Surrogates (U+D800..U+DB7F)
- High Private Use Surrogates (U+DB80..U+DBFF)
- Low Surrogates (U+DC00..U+DFFF)
- Private Use Area (U+E000..U+F8FF)
6) Open src\Microsoft.Framework.WebEncoders.Core\UnicodeRanges.generated.cs in
your IDE. Delete everything within the partial class definition and replace
it with the contents of UnicodeRanges.generated.txt. (Remember to remove
the blocks mentioned in the previous step, otherwise unit tests will fail.)
Open src\Microsoft.Framework.WebEncoders.Core\UnicodeRanges.cs in your IDE.
Update the doc comment at the top of the class to reflect the appropriate
version of the Unicode specification.
7) Open UnicodeRangesTests.generated.txt in your favorite text editor. As with
the previous .txt file, remove the [InlineData] lines that correspond to the
Unicode blocks you removed manually. See step (5) for the list of blocks
that must be removed. Then re-save this file.
8) Open test\Microsoft.Framework.WebEncoders.Tests\UnicodeRangesTests.cs in
your IDE. Delete all of the [InlineData] attributes on the Range_Unicode
test, then paste in the contents of UnicodeRangesTests.generated.txt to
create the new [InlineData] list.
IMPORTANT: Don't delete the [Theory] attribute on this method!
9) Open test\Microsoft.Framework.WebEncoders.Tests\UnicodeHelpersTests.cs in
your IDE. Scroll to the bottom of the ReadListOfDefinedCharacters method,
and you'll see a section where the test special-cases CJK Ideographs and
Hangul Syllables. As more characters are added to the Unicode specification
the list of valid CJK Ideographs and Hangul Syllables can grow, so make sure
these match up with the relevant lines in UnicodeData.txt. For instance, at
the time of this writing UnicodeData.txt lists the valid Hangul Syllable
character range as follows:
AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
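If necessary, update the logic in the ReadListOfDefinedCharacters method to
account for any changes to these lines in UnicodeData.txt. (For example, the
update from Unicode 7.0 to Unicode 8.0 moved the last CJK Ideograph from
U+9FCC to U+9FD5.)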
That's it! Run the unit tests and everything should be good to go. If you find
any stray comments throughout the code base that reference a specific version
of the Unicode specification, go ahead and update them so that they correctly
reflect the version you just updated to.
To recap, the files you should check in are:
src\Microsoft.Framework.WebEncoders.Core\compiler\resources\
unicode-defined-chars.bin
src\Microsoft.Framework.WebEncoders.Core\
UnicodeRanges.cs
UnicodeRanges.generated.cs
test\Microsoft.Framework.WebEncoders.Tests\
UnicodeHelpersTests.cs (if necessary, see step 9)
UnicodeRangesTests.cs
unicode\
Blocks.txt
unicode-copyright.txt
UnicodeData.txt

View File

@ -1,8 +1,8 @@
The files Blocks.txt and UnicodeData.txt in this directory were
retrieved from the following URLs on Saturday, February 7, 2015.
retrieved from the following URLs on Saturday, September 5, 2015.
http://www.unicode.org/Public/7.0.0/ucd/Blocks.txt
http://www.unicode.org/Public/7.0.0/ucd/UnicodeData.txt
http://www.unicode.org/Public/8.0.0/ucd/Blocks.txt
http://www.unicode.org/Public/8.0.0/ucd/UnicodeData.txt
The below copyright notice applies to these files.