Expanded the first-last ranges in the Unicode database, adding 38,642
previously missing characters -- but thanks to the 2.0 compression
scheme, this doesn't add a single byte to the resulting binaries (!)

Closes bug #117524
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 8c0c075..15841d7 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -9,6 +9,7 @@
 # 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
 # 2000-09-25 fl   added character type table
 # 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields
+# 2000-11-03 fl   expand first/last ranges
 #
 # written by Fredrik Lundh (fredrik@pythonware.com), September 2000
 #
@@ -39,10 +40,13 @@
 TITLE_MASK = 0x40
 UPPER_MASK = 0x80
 
-def maketables():
+def maketables(trace=0):
 
     unicode = UnicodeData(UNICODE_DATA)
 
+    print "--- Processing", UNICODE_DATA, "..."
+    print len(filter(None, unicode.table)), "characters"
+
     # extract unicode properties
     dummy = (0, 0, 0, 0)
     table = [dummy]
@@ -91,6 +95,11 @@
 
     FILE = "Modules/unicodedata_db.h"
 
+    print "--- Writing", FILE, "..."
+
+    print len(table), "unique properties"
+    print len(decomp_data), "unique decomposition entries"
+
     fp = open(FILE, "w")
     print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
     print >>fp
@@ -125,7 +134,7 @@
     print >>fp, "};"
 
     # split record index table
-    index1, index2, shift = splitbins(index)
+    index1, index2, shift = splitbins(index, trace)
 
     print >>fp, "/* index tables for the database records */"
     print >>fp, "#define SHIFT", shift
@@ -133,7 +142,7 @@
     Array("index2", index2).dump(fp)
 
     # split decomposition index table
-    index1, index2, shift = splitbins(decomp_index)
+    index1, index2, shift = splitbins(decomp_index, trace)
 
     print >>fp, "/* index tables for the decomposition data */"
     print >>fp, "#define DECOMP_SHIFT", shift
@@ -200,12 +209,14 @@
                 table.append(item)
             index[char] = i
 
-    print len(table), "ctype entries"
-
     FILE = "Objects/unicodetype_db.h"
 
     fp = open(FILE, "w")
 
+    print "--- Writing", FILE, "..."
+
+    print len(table), "unique character type entries"
+
     print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
     print >>fp
     print >>fp, "/* a list of unique character type descriptors */"
@@ -216,7 +227,7 @@
     print >>fp
 
     # split decomposition index table
-    index1, index2, shift = splitbins(index)
+    index1, index2, shift = splitbins(index, trace)
 
     print >>fp, "/* type indexes */"
     print >>fp, "#define SHIFT", shift
@@ -233,7 +244,7 @@
 
 class UnicodeData:
 
-    def __init__(self, filename):
+    def __init__(self, filename, expand=1):
         file = open(filename)
         table = [None] * 65536
         while 1:
@@ -244,6 +255,22 @@
             char = string.atoi(s[0], 16)
             table[char] = s
 
+        # expand first-last ranges (ignore surrogates and private use)
+        if expand:
+            field = None
+            for i in range(0, 0xD800):
+                s = table[i]
+                if s:
+                    if s[1][-6:] == "First>":
+                        s[1] = ""
+                        field = s[:]
+                    elif s[1][-5:] == "Last>":
+                        s[1] = ""
+                        field = None
+                elif field:
+                    field[0] = hex(i)
+                    table[i] = field
+
         # public attributes
         self.filename = filename
         self.table = table
@@ -306,8 +333,9 @@
         t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
     where mask is a bitmask isolating the last "shift" bits.
 
-    If optional arg trace is true (default false), progress info is
-    printed to sys.stderr.
+    If optional arg trace is non-zero (default zero), progress info
+    is printed to sys.stderr.  The higher the value, the more info
+    you'll get.
     """
 
     import sys
@@ -341,7 +369,7 @@
             t1.append(index >> shift)
         # determine memory size
         b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
-        if trace:
+        if trace > 1:
             dump(t1, t2, shift, b)
         if b < bytes:
             best = t1, t2, shift
@@ -358,4 +386,4 @@
     return best
 
 if __name__ == "__main__":
-    maketables()
+    maketables(1)