Add XID_Start and XID_Continue properties to unicodectype.

commit: 13c3e380d1ff807b1a18934ac9aace037c66f2ac [log] [tgz]
author: Martin v. Löwis <martin@v.loewis.de> Tue Aug 14 22:37:03 2007 +0000
committer: Martin v. Löwis <martin@v.loewis.de> Tue Aug 14 22:37:03 2007 +0000
tree: 2d8781b4eb3ba3fea04f133e7e512c50dafc8e82
parent: ff398c6f957fcd0e55aa57c0eaa5c1d24c5bc2f1 [diff] [blame]
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 0aabdf7..ab08887 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py

@@ -34,6 +34,7 @@
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
+DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 
 old_versions = ["3.2.0"]
 
@@ -57,6 +58,8 @@
 SPACE_MASK = 0x20
 TITLE_MASK = 0x40
 UPPER_MASK = 0x80
+XID_START_MASK = 0x100
+XID_CONTINUE_MASK = 0x200
 
 def maketables(trace=0):
 
@@ -65,16 +68,18 @@
     version = ""
     unicode = UnicodeData(UNICODE_DATA % version,
                           COMPOSITION_EXCLUSIONS % version,
-                          EASTASIAN_WIDTH % version)
+                          EASTASIAN_WIDTH % version,
+                          DERIVED_CORE_PROPERTIES % version)
 
-    print(len(filter(None, unicode.table)), "characters")
+    print(len(list(filter(None, unicode.table))), "characters")
 
     for version in old_versions:
         print("--- Reading", UNICODE_DATA % ("-"+version), "...")
         old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                   COMPOSITION_EXCLUSIONS % ("-"+version),
-                                  EASTASIAN_WIDTH % ("-"+version))
-        print(len(filter(None, old_unicode.table)), "characters")
+                                  EASTASIAN_WIDTH % ("-"+version),
+                                  DERIVED_CORE_PROPERTIES % ("-"+version))
+        print(len(list(filter(None, old_unicode.table))), "characters")
         merge_old_version(version, unicode, old_unicode)
 
     makeunicodename(unicode, trace)
@@ -148,7 +153,7 @@
                 assert prefix < 256
                 # content
                 decomp = [prefix + (len(decomp)<<8)] +\
-                         map(lambda s: int(s, 16), decomp)
+                         list(map(lambda s: int(s, 16), decomp))
                 # Collect NFC pairs
                 if not prefix and len(decomp) == 3 and \
                    char not in unicode.exclusions and \
@@ -353,6 +358,7 @@
             # extract database properties
             category = record[2]
             bidirectional = record[4]
+            properties = record[16]
             flags = 0
             if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                 flags |= ALPHA_MASK
@@ -366,6 +372,10 @@
                 flags |= TITLE_MASK
             if category == "Lu":
                 flags |= UPPER_MASK
+            if "XID_Start" in properties:
+                flags |= XID_START_MASK
+            if "XID_Continue" in properties:
+                flags |= XID_CONTINUE_MASK
             # use delta predictor for upper/lower/title
             if record[12]:
                 upper = int(record[12], 16) - char
@@ -447,7 +457,7 @@
             if name and name[0] != "<":
                 names[char] = name + chr(0)
 
-    print(len(filter(lambda n: n is not None, names)), "distinct names")
+    print(len(list(filter(lambda n: n is not None, names))), "distinct names")
 
     # collect unique words from names (note that we differ between
     # words inside a sentence, and words ending a sentence.  the
@@ -470,10 +480,12 @@
 
     print(n, "words in text;", b, "bytes")
 
-    wordlist = words.items()
+    wordlist = list(words.items())
 
     # sort on falling frequency, then by name
-    def cmpwords((aword, alist),(bword, blist)):
+    def cmpwords(a,b):
+        aword, alist = a
+        bword, blist = b
         r = -cmp(len(alist),len(blist))
         if r:
             return r
@@ -526,7 +538,7 @@
         words[w] = len(lexicon_offset)
         lexicon_offset.append(o)
 
-    lexicon = map(ord, lexicon)
+    lexicon = list(map(ord, lexicon))
 
     # generate phrasebook from names and lexicon
     phrasebook = [0]
@@ -660,11 +672,14 @@
                     elif k == 14:
                         # change to simple titlecase mapping; ignore
                         pass
+                    elif k == 16:
+                        # derived property changes; not yet handled
+                        pass
                     else:
                         class Difference(Exception):pass
                         raise Difference, (hex(i), k, old.table[i], new.table[i])
-    new.changed.append((version, zip(bidir_changes, category_changes,
-                                     decimal_changes, numeric_changes),
+    new.changed.append((version, list(zip(bidir_changes, category_changes,
+                                     decimal_changes, numeric_changes)),
                         normalization_changes))
 
 
@@ -677,8 +692,14 @@
 import sys
 
 class UnicodeData:
+    # Record structure:
+    # [ID, name, category, combining, bidi, decomp,  (6)
+    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
+    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
+    #  derived-props] (17)
 
-    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
+    def __init__(self, filename, exclusions, eastasianwidth,
+                 derivedprops, expand=1):
         self.changed = []
         file = open(filename)
         table = [None] * 0x110000
@@ -742,6 +763,28 @@
             if table[i] is not None:
                 table[i].append(widths[i])
 
+        for i in range(0, 0x110000):
+            if table[i] is not None:
+                table[i].append(set())
+        for s in open(derivedprops):
+            s = s.split('#', 1)[0].strip()
+            if not s:
+                continue
+
+            r, p = s.split(";")
+            r = r.strip()
+            p = p.strip()
+            if ".." in r:
+                first, last = [int(c, 16) for c in r.split('..')]
+                chars = range(first, last+1)
+            else:
+                chars = [int(r, 16)]
+            for char in chars:
+                if table[char]:
+                    # Some properties (e.g. Default_Ignorable_Code_Point)
+                    # also cover unassigned code points; those are skipped
+                    table[char][-1].add(p)
+
     def uselatin1(self):
         # restrict character range to ISO Latin 1
         self.chars = range(256)
commit	13c3e380d1ff807b1a18934ac9aace037c66f2ac	[log] [tgz]
author	Martin v. Löwis <martin@v.loewis.de>	Tue Aug 14 22:37:03 2007 +0000
committer	Martin v. Löwis <martin@v.loewis.de>	Tue Aug 14 22:37:03 2007 +0000
tree	2d8781b4eb3ba3fea04f133e7e512c50dafc8e82
parent	ff398c6f957fcd0e55aa57c0eaa5c1d24c5bc2f1 [diff] [blame]