- SF #962502: Add two more methods for the unicode type, width() and iswide(), for East Asian width manipulation. (Inspired by David Goodger, reviewed by Martin v. Loewis) - Move _PyUnicode_TypeRecord.flags to the end of the struct so that no padding is added for UCS-4 builds. (Suggested by Martin v. Loewis)

commit: 974ed7cfa50b666c9ab91f7a3f8f26049d387107 [log] [tgz]
author: Hye-Shik Chang <hyeshik@gmail.com> Wed Jun 02 16:49:17 2004 +0000
committer: Hye-Shik Chang <hyeshik@gmail.com> Wed Jun 02 16:49:17 2004 +0000
tree: d821c74c26231d988f34764d0fdfe3494036ee95
parent: b6568b91fdf7de1377dba395c6725a7307b818ee [diff] [blame]
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index c948312..6c29fd1 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py

@@ -18,6 +18,7 @@
 # 2002-10-22 mvl  generate NFC tables
 # 2002-11-24 mvl  expand all ranges, sort names version-independently
 # 2002-11-25 mvl  add UNIDATA_VERSION
+# 2004-05-29 perky add east asian width information
 #
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #
@@ -25,12 +26,13 @@
 import sys
 
 SCRIPT = sys.argv[0]
-VERSION = "2.2"
+VERSION = "2.3"
 
 # The Unicode Database
 UNIDATA_VERSION = "3.2.0"
 UNICODE_DATA = "UnicodeData.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
+EASTASIAN_WIDTH = "EastAsianWidth.txt"
 
 CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
     "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
@@ -50,12 +52,14 @@
 SPACE_MASK = 0x20
 TITLE_MASK = 0x40
 UPPER_MASK = 0x80
+WIDE_MASK = 0x100
 
 def maketables(trace=0):
 
     print "--- Reading", UNICODE_DATA, "..."
 
-    unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS)
+    unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS,
+                          EASTASIAN_WIDTH)
 
     print len(filter(None, unicode.table)), "characters"
 
@@ -330,8 +334,10 @@
             if record[7]:
                 flags |= DIGIT_MASK
                 digit = int(record[7])
+            if record[15] in ('W', 'F'): # Wide or Full width
+                flags |= WIDE_MASK
             item = (
-                flags, upper, lower, title, decimal, digit
+                upper, lower, title, decimal, digit, flags
                 )
             # add entry to index and item tables
             i = cache.get(item)
@@ -538,7 +544,7 @@
 
 class UnicodeData:
 
-    def __init__(self, filename, exclusions, expand=1):
+    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
         file = open(filename)
         table = [None] * 0x110000
         while 1:
@@ -581,6 +587,25 @@
             char = int(s.split()[0],16)
             self.exclusions[char] = 1
 
+        widths = [None] * 0x110000
+        for s in open(eastasianwidth):
+            s = s.strip()
+            if not s:
+                continue
+            if s[0] == '#':
+                continue
+            s = s.split()[0].split(';')
+            if '..' in s[0]:
+                first, last = [int(c, 16) for c in s[0].split('..')]
+                chars = range(first, last+1)
+            else:
+                chars = [int(s[0], 16)]
+            for char in chars:
+                widths[char] = s[1]
+        for i in range(0, 0x110000):
+            if table[i] is not None:
+                table[i].append(widths[i])
+
     def uselatin1(self):
         # restrict character range to ISO Latin 1
         self.chars = range(256)
commit	974ed7cfa50b666c9ab91f7a3f8f26049d387107	[log] [tgz]
author	Hye-Shik Chang <hyeshik@gmail.com>	Wed Jun 02 16:49:17 2004 +0000
committer	Hye-Shik Chang <hyeshik@gmail.com>	Wed Jun 02 16:49:17 2004 +0000
tree	d821c74c26231d988f34764d0fdfe3494036ee95
parent	b6568b91fdf7de1377dba395c6725a7307b818ee [diff] [blame]