Merged revisions 75272-75273 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk

........
  r75272 | amaury.forgeotdarc | 2009-10-06 21:56:32 +0200 (mar., 06 oct. 2009) | 5 lines

  #1571184: makeunicodedata.py now generates the functions _PyUnicode_ToNumeric,
  _PyUnicode_IsLinebreak and _PyUnicode_IsWhitespace.

  It now also parses the Unihan.txt file for numeric values.
........
  r75273 | amaury.forgeotdarc | 2009-10-06 22:02:09 +0200 (mar., 06 oct. 2009) | 2 lines

  Add Anders Chrigstrom to Misc/ACKS for his work on unicodedata.
........
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 52cb365..439a45b 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -35,6 +35,7 @@
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
+UNIHAN = "Unihan%s.txt"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 
@@ -64,6 +65,7 @@
 XID_CONTINUE_MASK = 0x200
 PRINTABLE_MASK = 0x400
 NODELTA_MASK = 0x800
+NUMERIC_MASK = 0x1000
 
 def maketables(trace=0):
 
@@ -73,6 +75,7 @@
     unicode = UnicodeData(UNICODE_DATA % version,
                           COMPOSITION_EXCLUSIONS % version,
                           EASTASIAN_WIDTH % version,
+                          UNIHAN % version,
                           DERIVED_CORE_PROPERTIES % version,
                           DERIVEDNORMALIZATION_PROPS % version)
 
@@ -83,6 +86,7 @@
         old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                   COMPOSITION_EXCLUSIONS % ("-"+version),
                                   EASTASIAN_WIDTH % ("-"+version),
+                                  UNIHAN % ("-"+version),
                                   DERIVED_CORE_PROPERTIES % ("-"+version))
         print(len(list(filter(None, old_unicode.table))), "characters")
         merge_old_version(version, unicode, old_unicode)
@@ -357,6 +361,9 @@
     table = [dummy]
     cache = {0: dummy}
     index = [0] * len(unicode.chars)
+    numeric = {}
+    spaces = []
+    linebreaks = []
 
     for char in unicode.chars:
         record = unicode.table[char]
@@ -373,8 +380,10 @@
                 flags |= LOWER_MASK
             if category == "Zl" or bidirectional == "B":
                 flags |= LINEBREAK_MASK
+                linebreaks.append(char)
             if category == "Zs" or bidirectional in ("WS", "B", "S"):
                 flags |= SPACE_MASK
+                spaces.append(char)
             if category == "Lt":
                 flags |= TITLE_MASK
             if category == "Lu":
@@ -423,6 +432,9 @@
             if record[7]:
                 flags |= DIGIT_MASK
                 digit = int(record[7])
+            if record[8]:
+                flags |= NUMERIC_MASK
+                numeric.setdefault(record[8], []).append(char)
             item = (
                 upper, lower, title, decimal, digit, flags
                 )
@@ -434,6 +446,9 @@
             index[char] = i
 
     print(len(table), "unique character type entries")
+    print(sum(map(len, numeric.values())), "numeric code points")
+    print(len(spaces), "whitespace code points")
+    print(len(linebreaks), "linebreak code points")
 
     print("--- Writing", FILE, "...")
 
@@ -455,6 +470,96 @@
     Array("index1", index1).dump(fp, trace)
     Array("index2", index2).dump(fp, trace)
 
+    # Generate code for _PyUnicode_ToNumeric()
+    numeric_items = sorted(numeric.items())
+    print('/* Returns the numeric value as double for Unicode characters', file=fp)
+    print(' * having this property, -1.0 otherwise.', file=fp)
+    print(' */', file=fp)
+    print('double _PyUnicode_ToNumeric(Py_UNICODE ch)', file=fp)
+    print('{', file=fp)
+    print('    switch (ch) {', file=fp)
+    for value, codepoints in numeric_items:
+        haswide = False
+        hasnonewide = False
+        codepoints.sort()
+        for codepoint in codepoints:
+            if codepoint < 0x10000:
+                hasnonewide = True
+            if codepoint >= 0x10000 and not haswide:
+                print('#ifdef Py_UNICODE_WIDE', file=fp)
+                haswide = True
+            print('    case 0x%04X:' % (codepoint,), file=fp)
+        if haswide and hasnonewide:
+            print('#endif', file=fp)
+        print('        return (double) %s;' % (value,), file=fp)
+        if haswide and not hasnonewide:
+            print('#endif', file=fp)
+    print('    }', file=fp)
+    print('    return -1.0;', file=fp)
+    print('}', file=fp)
+    print(file=fp)
+
+    # Generate code for _PyUnicode_IsWhitespace()
+    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
+    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
+    print(" */", file=fp)
+    print('int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)', file=fp)
+    print('{', file=fp)
+    print('#ifdef WANT_WCTYPE_FUNCTIONS', file=fp)
+    print('    return iswspace(ch);', file=fp)
+    print('#else', file=fp)
+    print('    switch (ch) {', file=fp)
+
+    haswide = False
+    hasnonewide = False
+    spaces.sort()
+    for codepoint in spaces:
+        if codepoint < 0x10000:
+            hasnonewide = True
+        if codepoint >= 0x10000 and not haswide:
+            print('#ifdef Py_UNICODE_WIDE', file=fp)
+            haswide = True
+        print('    case 0x%04X:' % (codepoint,), file=fp)
+    if haswide and hasnonewide:
+        print('#endif', file=fp)
+    print('        return 1;', file=fp)
+    if haswide and not hasnonewide:
+        print('#endif', file=fp)
+
+    print('    }', file=fp)
+    print('    return 0;', file=fp)
+    print('#endif', file=fp)
+    print('}', file=fp)
+    print(file=fp)
+
+    # Generate code for _PyUnicode_IsLinebreak()
+    print("/* Returns 1 for Unicode characters having the category 'Zl',", file=fp)
+    print(" * 'Zp' or type 'B', 0 otherwise.", file=fp)
+    print(" */", file=fp)
+    print('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)', file=fp)
+    print('{', file=fp)
+    print('    switch (ch) {', file=fp)
+    haswide = False
+    hasnonewide = False
+    linebreaks.sort()
+    for codepoint in linebreaks:
+        if codepoint < 0x10000:
+            hasnonewide = True
+        if codepoint >= 0x10000 and not haswide:
+            print('#ifdef Py_UNICODE_WIDE', file=fp)
+            haswide = True
+        print('    case 0x%04X:' % (codepoint,), file=fp)
+    if haswide and hasnonewide:
+        print('#endif', file=fp)
+    print('        return 1;', file=fp)
+    if haswide and not hasnonewide:
+        print('#endif', file=fp)
+
+    print('    }', file=fp)
+    print('    return 0;', file=fp)
+    print('}', file=fp)
+    print(file=fp)
+
     fp.close()
 
 # --------------------------------------------------------------------
@@ -670,12 +775,11 @@
                     elif k == 8:
                         # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                         # Since 0 encodes "no change", the old value is better not 0
-                        assert value != "0" and value != "-1"
                         if not value:
                             numeric_changes[i] = -1
                         else:
-                            assert re.match("^[0-9]+$", value)
-                            numeric_changes[i] = int(value)
+                            numeric_changes[i] = float(value)
+                            assert numeric_changes[i] not in (0, -1)
                     elif k == 9:
                         if value == 'Y':
                             mirrored_changes[i] = '1'
@@ -711,8 +815,6 @@
 
 # load a unicode-data file from disk
 
-import sys
-
 class UnicodeData:
     # Record structure:
     # [ID, name, category, combining, bidi, decomp,  (6)
@@ -720,7 +822,7 @@
     #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
     #  derived-props] (17)
 
-    def __init__(self, filename, exclusions, eastasianwidth,
+    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                  derivedprops, derivednormalizationprops=None, expand=1):
         self.changed = []
         file = open(filename)
@@ -830,6 +932,19 @@
                 if table[i] is not None:
                     table[i].append(quickchecks[i])
 
+        for line in open(unihan, encoding='utf-8'):
+            if not line.startswith('U+'):
+                continue
+            code, tag, value = line.split(None, 3)[:3]
+            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
+                           'kOtherNumeric'):
+                continue
+            value = value.strip().replace(',', '')
+            i = int(code[2:], 16)
+            # Patch the numeric field
+            if table[i] is not None:
+                table[i][8] = value
+
     def uselatin1(self):
         # restrict character range to ISO Latin 1
         self.chars = list(range(256))
@@ -979,7 +1094,6 @@
     you'll get.
     """
 
-    import sys
     if trace:
         def dump(t1, t2, shift, bytes):
             print("%d+%d bins at shift %d; %d bytes" % (