Upgrade to Unicode 6.0.0. makeunicodedata.py: download all data files from unicode.org, switch to extracting Unihan data from zip file. Read linebreakprops and derivednormalizationprops even for old versions, even though they are not used in delta records. test:unicode.py: U+11000 is now assigned, use U+14000 instead.

commit: baecd7243ad2d671fa6f675777dfe7094e8c5c39 [log] [tgz]
author: Martin v. Löwis <martin@v.loewis.de> Mon Oct 11 22:42:28 2010 +0000
committer: Martin v. Löwis <martin@v.loewis.de> Mon Oct 11 22:42:28 2010 +0000
tree: a4e6b946feafacd5c5f1bccd8a30d69fa7dd6922
parent: e8930228c7482d39faf24f4a805c37ff7ca9223a [diff] [blame]
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index b2615ee..0783f17 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py

@@ -25,17 +25,17 @@
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #
 
-import sys
+import sys, os, zipfile
 
 SCRIPT = sys.argv[0]
 VERSION = "3.2"
 
 # The Unicode Database
-UNIDATA_VERSION = "5.2.0"
+UNIDATA_VERSION = "6.0.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
-UNIHAN = "Unihan%s.txt"
+UNIHAN = "Unihan%s.zip"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 LINE_BREAK = "LineBreak%s.txt"
@@ -75,23 +75,13 @@
     print("--- Reading", UNICODE_DATA % "", "...")
 
     version = ""
-    unicode = UnicodeData(UNICODE_DATA % version,
-                          COMPOSITION_EXCLUSIONS % version,
-                          EASTASIAN_WIDTH % version,
-                          UNIHAN % version,
-                          DERIVED_CORE_PROPERTIES % version,
-                          DERIVEDNORMALIZATION_PROPS % version,
-                          LINE_BREAK % version)
+    unicode = UnicodeData(UNIDATA_VERSION)
 
     print(len(list(filter(None, unicode.table))), "characters")
 
     for version in old_versions:
         print("--- Reading", UNICODE_DATA % ("-"+version), "...")
-        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
-                                  COMPOSITION_EXCLUSIONS % ("-"+version),
-                                  EASTASIAN_WIDTH % ("-"+version),
-                                  UNIHAN % ("-"+version),
-                                  DERIVED_CORE_PROPERTIES % ("-"+version))
+        old_unicode = UnicodeData(version)
         print(len(list(filter(None, old_unicode.table))), "characters")
         merge_old_version(version, unicode, old_unicode)
 
@@ -771,6 +761,10 @@
                     elif k == 16:
                         # derived property changes; not yet
                         pass
+                    elif k == 17:
+                        # normalization quickchecks are not performed
+                        # for older versions
+                        pass
                     else:
                         class Difference(Exception):pass
                         raise Difference(hex(i), k, old.table[i], new.table[i])
@@ -779,6 +773,21 @@
                                      numeric_changes)),
                         normalization_changes))
 
+def open_data(template, version):
+    local = template % ('-'+version,)
+    if not os.path.exists(local):
+        import urllib.request
+        if version == '3.2.0':
+            # irregular url structure
+            url = 'http://www.unicode.org/Public/3.2-Update/' + local
+        else:
+            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
+        urllib.request.urlretrieve(url, filename=local)
+    if local.endswith('.txt'):
+        return open(local, encoding='utf-8')
+    else:
+        # Unihan.zip
+        return open(local, 'rb')
 
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
@@ -793,11 +802,11 @@
     #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
     #  derived-props] (17)
 
-    def __init__(self, filename, exclusions, eastasianwidth, unihan,
-                 derivedprops, derivednormalizationprops=None, linebreakprops=None,
+    def __init__(self, version,
+                 linebreakprops=False,
                  expand=1):
         self.changed = []
-        file = open(filename)
+        file = open_data(UNICODE_DATA, version)
         table = [None] * 0x110000
         while 1:
             s = file.readline()
@@ -825,11 +834,11 @@
                     table[i] = f2
 
         # public attributes
-        self.filename = filename
+        self.filename = UNICODE_DATA % ''
         self.table = table
         self.chars = list(range(0x110000)) # unicode 3.2
 
-        file = open(exclusions)
+        file = open_data(COMPOSITION_EXCLUSIONS, version)
         self.exclusions = {}
         for s in file:
             s = s.strip()
@@ -841,7 +850,7 @@
             self.exclusions[char] = 1
 
         widths = [None] * 0x110000
-        for s in open(eastasianwidth):
+        for s in open_data(EASTASIAN_WIDTH, version):
             s = s.strip()
             if not s:
                 continue
@@ -862,7 +871,7 @@
         for i in range(0, 0x110000):
             if table[i] is not None:
                 table[i].append(set())
-        for s in open(derivedprops):
+        for s in open_data(DERIVED_CORE_PROPERTIES, version):
             s = s.split('#', 1)[0].strip()
             if not s:
                 continue
@@ -881,43 +890,53 @@
                     # apply to unassigned code points; ignore them
                     table[char][-1].add(p)
 
-        if linebreakprops:
-            for s in open(linebreakprops):
-                s = s.partition('#')[0]
-                s = [i.strip() for i in s.split(';')]
-                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
-                    continue
-                if '..' not in s[0]:
-                    first = last = int(s[0], 16)
-                else:
-                    first, last = [int(c, 16) for c in s[0].split('..')]
-                for char in range(first, last+1):
-                    table[char][-1].add('Line_Break')
+        for s in open_data(LINE_BREAK, version):
+            s = s.partition('#')[0]
+            s = [i.strip() for i in s.split(';')]
+            if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+                continue
+            if '..' not in s[0]:
+                first = last = int(s[0], 16)
+            else:
+                first, last = [int(c, 16) for c in s[0].split('..')]
+            for char in range(first, last+1):
+                table[char][-1].add('Line_Break')
 
-        if derivednormalizationprops:
-            quickchecks = [0] * 0x110000 # default is Yes
-            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
-            for s in open(derivednormalizationprops):
-                if '#' in s:
-                    s = s[:s.index('#')]
-                s = [i.strip() for i in s.split(';')]
-                if len(s) < 2 or s[1] not in qc_order:
-                    continue
-                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
-                quickcheck_shift = qc_order.index(s[1])*2
-                quickcheck <<= quickcheck_shift
-                if '..' not in s[0]:
-                    first = last = int(s[0], 16)
-                else:
-                    first, last = [int(c, 16) for c in s[0].split('..')]
-                for char in range(first, last+1):
-                    assert not (quickchecks[char]>>quickcheck_shift)&3
-                    quickchecks[char] |= quickcheck
-            for i in range(0, 0x110000):
-                if table[i] is not None:
-                    table[i].append(quickchecks[i])
+        # We only want the quickcheck properties
+        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
+        # Yes is the default, hence only N and M occur
+        # In 3.2.0, the format was different (NF?_NO)
+        # The parsing will incorrectly determine these as
+        # "yes", however, unicodedata.c will not perform quickchecks
+        # for older versions, and no delta records will be created.
+        quickchecks = [0] * 0x110000
+        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
+        for s in open_data(DERIVEDNORMALIZATION_PROPS, version):
+            if '#' in s:
+                s = s[:s.index('#')]
+            s = [i.strip() for i in s.split(';')]
+            if len(s) < 2 or s[1] not in qc_order:
+                continue
+            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+            quickcheck_shift = qc_order.index(s[1])*2
+            quickcheck <<= quickcheck_shift
+            if '..' not in s[0]:
+                first = last = int(s[0], 16)
+            else:
+                first, last = [int(c, 16) for c in s[0].split('..')]
+            for char in range(first, last+1):
+                assert not (quickchecks[char]>>quickcheck_shift)&3
+                quickchecks[char] |= quickcheck
+        for i in range(0, 0x110000):
+            if table[i] is not None:
+                table[i].append(quickchecks[i])
 
-        for line in open(unihan, encoding='utf-8'):
+        zip = zipfile.ZipFile(open_data(UNIHAN, version))
+        if version == '3.2.0':
+            data = zip.open('Unihan-3.2.0.txt').read()
+        else:
+            data = zip.open('Unihan_NumericValues.txt').read()
+        for line in data.decode("utf-8").splitlines():
             if not line.startswith('U+'):
                 continue
             code, tag, value = line.split(None, 3)[:3]
commit	baecd7243ad2d671fa6f675777dfe7094e8c5c39	[log] [tgz]
author	Martin v. Löwis <martin@v.loewis.de>	Mon Oct 11 22:42:28 2010 +0000
committer	Martin v. Löwis <martin@v.loewis.de>	Mon Oct 11 22:42:28 2010 +0000
tree	a4e6b946feafacd5c5f1bccd8a30d69fa7dd6922
parent	e8930228c7482d39faf24f4a805c37ff7ca9223a [diff] [blame]