blob: 8ede83c71cd281b8ed23f5c9b6a984abfa6c7633 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl    created (based on bits and pieces from unidb)
# 2000-09-25 fl    merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl    added character type table
# 2000-09-26 fl    added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl    expand first/last ranges
# 2001-01-19 fl    added character name tables (2.1)
# 2001-01-21 fl    added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd    use string methods
# 2002-10-18 mvl   update to Unicode 3.2
# 2002-10-22 mvl   generate NFC tables
# 2002-11-24 mvl   expand all ranges, sort names version-independently
# 2002-11-25 mvl   add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl   update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
26
27import sys
28
29SCRIPT = sys.argv[0]
Martin v. Löwis24329ba2008-09-10 13:38:12 +000030VERSION = "2.6"
Fredrik Lundhf367cac2000-09-24 23:18:31 +000031
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000032# The Unicode Database
Martin v. Löwis24329ba2008-09-10 13:38:12 +000033UNIDATA_VERSION = "5.1.0"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000034UNICODE_DATA = "UnicodeData%s.txt"
35COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
36EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
37
38old_versions = ["3.2.0"]
Fredrik Lundhf367cac2000-09-24 23:18:31 +000039
40CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
41 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
42 "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
43 "So" ]
44
45BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
46 "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
47 "ON" ]
48
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +000049EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
50
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000051# note: should match definitions in Objects/unicodectype.c
Fredrik Lundhe9133f72000-09-25 17:59:57 +000052ALPHA_MASK = 0x01
53DECIMAL_MASK = 0x02
54DIGIT_MASK = 0x04
55LOWER_MASK = 0x08
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000056LINEBREAK_MASK = 0x10
Fredrik Lundhe9133f72000-09-25 17:59:57 +000057SPACE_MASK = 0x20
58TITLE_MASK = 0x40
59UPPER_MASK = 0x80
Martin v. Löwis24329ba2008-09-10 13:38:12 +000060NODELTA_MASK = 0x100
Fredrik Lundhe9133f72000-09-25 17:59:57 +000061
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000062def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +000063
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000064 print "--- Reading", UNICODE_DATA % "", "..."
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000065
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000066 version = ""
67 unicode = UnicodeData(UNICODE_DATA % version,
68 COMPOSITION_EXCLUSIONS % version,
69 EASTASIAN_WIDTH % version)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000070
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000071 print len(filter(None, unicode.table)), "characters"
72
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000073 for version in old_versions:
74 print "--- Reading", UNICODE_DATA % ("-"+version), "..."
75 old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
76 COMPOSITION_EXCLUSIONS % ("-"+version),
77 EASTASIAN_WIDTH % ("-"+version))
78 print len(filter(None, old_unicode.table)), "characters"
79 merge_old_version(version, unicode, old_unicode)
80
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000081 makeunicodename(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000082 makeunicodedata(unicode, trace)
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000083 makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000084
85# --------------------------------------------------------------------
86# unicode character properties
87
88def makeunicodedata(unicode, trace):
89
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +000090 dummy = (0, 0, 0, 0, 0)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000091 table = [dummy]
92 cache = {0: dummy}
93 index = [0] * len(unicode.chars)
94
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000095 FILE = "Modules/unicodedata_db.h"
96
97 print "--- Preparing", FILE, "..."
98
Fredrik Lundhcfcea492000-09-25 08:07:06 +000099 # 1) database properties
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000100
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000101 for char in unicode.chars:
102 record = unicode.table[char]
103 if record:
104 # extract database properties
105 category = CATEGORY_NAMES.index(record[2])
106 combining = int(record[3])
107 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
108 mirrored = record[9] == "Y"
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000109 eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000110 item = (
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000111 category, combining, bidirectional, mirrored, eastasianwidth
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000112 )
113 # add entry to index and item tables
114 i = cache.get(item)
115 if i is None:
116 cache[item] = i = len(table)
117 table.append(item)
118 index[char] = i
119
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000120 # 2) decomposition data
121
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000122 decomp_data = [0]
123 decomp_prefix = [""]
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000124 decomp_index = [0] * len(unicode.chars)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000125 decomp_size = 0
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000126
Martin v. Löwis677bde22002-11-23 22:08:15 +0000127 comp_pairs = []
128 comp_first = [None] * len(unicode.chars)
129 comp_last = [None] * len(unicode.chars)
130
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000131 for char in unicode.chars:
132 record = unicode.table[char]
133 if record:
134 if record[5]:
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000135 decomp = record[5].split()
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000136 if len(decomp) > 19:
137 raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000138 # prefix
139 if decomp[0][0] == "<":
140 prefix = decomp.pop(0)
141 else:
142 prefix = ""
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000143 try:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000144 i = decomp_prefix.index(prefix)
145 except ValueError:
146 i = len(decomp_prefix)
147 decomp_prefix.append(prefix)
148 prefix = i
149 assert prefix < 256
150 # content
151 decomp = [prefix + (len(decomp)<<8)] +\
152 map(lambda s: int(s, 16), decomp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000153 # Collect NFC pairs
154 if not prefix and len(decomp) == 3 and \
155 char not in unicode.exclusions and \
156 unicode.table[decomp[1]][3] == "0":
157 p, l, r = decomp
158 comp_first[l] = 1
159 comp_last[r] = 1
160 comp_pairs.append((l,r,char))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000161 try:
162 i = decomp_data.index(decomp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000163 except ValueError:
164 i = len(decomp_data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000165 decomp_data.extend(decomp)
166 decomp_size = decomp_size + len(decomp) * 2
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000167 else:
168 i = 0
169 decomp_index[char] = i
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000170
Martin v. Löwis677bde22002-11-23 22:08:15 +0000171 f = l = 0
172 comp_first_ranges = []
173 comp_last_ranges = []
174 prev_f = prev_l = None
175 for i in unicode.chars:
176 if comp_first[i] is not None:
177 comp_first[i] = f
178 f += 1
179 if prev_f is None:
180 prev_f = (i,i)
181 elif prev_f[1]+1 == i:
182 prev_f = prev_f[0],i
183 else:
184 comp_first_ranges.append(prev_f)
185 prev_f = (i,i)
186 if comp_last[i] is not None:
187 comp_last[i] = l
188 l += 1
189 if prev_l is None:
190 prev_l = (i,i)
191 elif prev_l[1]+1 == i:
192 prev_l = prev_l[0],i
193 else:
194 comp_last_ranges.append(prev_l)
195 prev_l = (i,i)
196 comp_first_ranges.append(prev_f)
197 comp_last_ranges.append(prev_l)
198 total_first = f
199 total_last = l
200
201 comp_data = [0]*(total_first*total_last)
202 for f,l,char in comp_pairs:
203 f = comp_first[f]
204 l = comp_last[l]
205 comp_data[f*total_last+l] = char
206
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000207 print len(table), "unique properties"
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000208 print len(decomp_prefix), "unique decomposition prefixes"
209 print len(decomp_data), "unique decomposition entries:",
210 print decomp_size, "bytes"
Martin v. Löwis677bde22002-11-23 22:08:15 +0000211 print total_first, "first characters in NFC"
212 print total_last, "last characters in NFC"
213 print len(comp_pairs), "NFC pairs"
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000214
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000215 print "--- Writing", FILE, "..."
216
Fred Drake9c685052000-10-26 03:56:46 +0000217 fp = open(FILE, "w")
218 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
219 print >>fp
Martin v. Löwisb5c980b2002-11-25 09:13:37 +0000220 print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
Fred Drake9c685052000-10-26 03:56:46 +0000221 print >>fp, "/* a list of unique database records */"
222 print >>fp, \
223 "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000224 for item in table:
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000225 print >>fp, " {%d, %d, %d, %d, %d}," % item
Fred Drake9c685052000-10-26 03:56:46 +0000226 print >>fp, "};"
227 print >>fp
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000228
Martin v. Löwis677bde22002-11-23 22:08:15 +0000229 print >>fp, "/* Reindexing of NFC first characters. */"
230 print >>fp, "#define TOTAL_FIRST",total_first
231 print >>fp, "#define TOTAL_LAST",total_last
232 print >>fp, "struct reindex{int start;short count,index;};"
Martin v. Löwis111c1802008-06-13 07:47:47 +0000233 print >>fp, "static struct reindex nfc_first[] = {"
Martin v. Löwis677bde22002-11-23 22:08:15 +0000234 for start,end in comp_first_ranges:
235 print >>fp," { %d, %d, %d}," % (start,end-start,comp_first[start])
236 print >>fp," {0,0,0}"
237 print >>fp,"};\n"
Martin v. Löwis111c1802008-06-13 07:47:47 +0000238 print >>fp, "static struct reindex nfc_last[] = {"
Martin v. Löwis677bde22002-11-23 22:08:15 +0000239 for start,end in comp_last_ranges:
240 print >>fp," { %d, %d, %d}," % (start,end-start,comp_last[start])
241 print >>fp," {0,0,0}"
242 print >>fp,"};\n"
243
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000244 # FIXME: <fl> the following tables could be made static, and
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000245 # the support code moved into unicodedatabase.c
246
Fred Drake9c685052000-10-26 03:56:46 +0000247 print >>fp, "/* string literals */"
248 print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000249 for name in CATEGORY_NAMES:
Fred Drake9c685052000-10-26 03:56:46 +0000250 print >>fp, " \"%s\"," % name
251 print >>fp, " NULL"
252 print >>fp, "};"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000253
Fred Drake9c685052000-10-26 03:56:46 +0000254 print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000255 for name in BIDIRECTIONAL_NAMES:
Fred Drake9c685052000-10-26 03:56:46 +0000256 print >>fp, " \"%s\"," % name
257 print >>fp, " NULL"
258 print >>fp, "};"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000259
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000260 print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
261 for name in EASTASIANWIDTH_NAMES:
262 print >>fp, " \"%s\"," % name
263 print >>fp, " NULL"
264 print >>fp, "};"
265
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000266 print >>fp, "static const char *decomp_prefix[] = {"
267 for name in decomp_prefix:
Fred Drake9c685052000-10-26 03:56:46 +0000268 print >>fp, " \"%s\"," % name
269 print >>fp, " NULL"
270 print >>fp, "};"
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000271
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000272 # split record index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000273 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000274
Fred Drake9c685052000-10-26 03:56:46 +0000275 print >>fp, "/* index tables for the database records */"
276 print >>fp, "#define SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000277 Array("index1", index1).dump(fp, trace)
278 Array("index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000279
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000280 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000281 index1, index2, shift = splitbins(decomp_index, trace)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000282
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000283 print >>fp, "/* decomposition data */"
284 Array("decomp_data", decomp_data).dump(fp, trace)
285
Fred Drake9c685052000-10-26 03:56:46 +0000286 print >>fp, "/* index tables for the decomposition data */"
287 print >>fp, "#define DECOMP_SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000288 Array("decomp_index1", index1).dump(fp, trace)
289 Array("decomp_index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000290
Martin v. Löwis677bde22002-11-23 22:08:15 +0000291 index, index2, shift = splitbins(comp_data, trace)
292 print >>fp, "/* NFC pairs */"
293 print >>fp, "#define COMP_SHIFT", shift
294 Array("comp_index", index).dump(fp, trace)
295 Array("comp_data", index2).dump(fp, trace)
296
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000297 # Generate delta tables for old versions
298 for version, table, normalization in unicode.changed:
299 cversion = version.replace(".","_")
300 records = [table[0]]
301 cache = {table[0]:0}
302 index = [0] * len(table)
303 for i, record in enumerate(table):
304 try:
305 index[i] = cache[record]
306 except KeyError:
307 index[i] = cache[record] = len(records)
308 records.append(record)
309 index1, index2, shift = splitbins(index, trace)
310 print >>fp, "static const change_record change_records_%s[] = {" % cversion
311 for record in records:
312 print >>fp, "\t{ %s }," % ", ".join(map(str,record))
313 print >>fp, "};"
314 Array("changes_%s_index" % cversion, index1).dump(fp, trace)
315 Array("changes_%s_data" % cversion, index2).dump(fp, trace)
316 print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
317 print >>fp, "{"
318 print >>fp, "\tint index;"
319 print >>fp, "\tif (n >= 0x110000) index = 0;"
320 print >>fp, "\telse {"
321 print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
322 print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
323 (cversion, shift, ((1<<shift)-1))
324 print >>fp, "\t}"
325 print >>fp, "\treturn change_records_%s+index;" % cversion
326 print >>fp, "}\n"
327 print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
328 print >>fp, "{"
329 print >>fp, "\tswitch(n) {"
330 for k, v in normalization:
331 print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
332 print >>fp, "\tdefault: return 0;"
333 print >>fp, "\t}\n}\n"
334
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000335 fp.close()
336
337# --------------------------------------------------------------------
338# unicode character type tables
339
340def makeunicodetype(unicode, trace):
341
342 FILE = "Objects/unicodetype_db.h"
343
344 print "--- Preparing", FILE, "..."
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000345
346 # extract unicode types
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000347 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000348 table = [dummy]
349 cache = {0: dummy}
350 index = [0] * len(unicode.chars)
351
352 for char in unicode.chars:
353 record = unicode.table[char]
354 if record:
355 # extract database properties
356 category = record[2]
357 bidirectional = record[4]
358 flags = 0
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000359 delta = True
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000360 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
361 flags |= ALPHA_MASK
362 if category == "Ll":
363 flags |= LOWER_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000364 if category == "Zl" or bidirectional == "B":
365 flags |= LINEBREAK_MASK
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000366 if category == "Zs" or bidirectional in ("WS", "B", "S"):
367 flags |= SPACE_MASK
Fredrik Lundh375732c2000-09-25 23:03:34 +0000368 if category == "Lt":
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000369 flags |= TITLE_MASK
370 if category == "Lu":
371 flags |= UPPER_MASK
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000372 # use delta predictor for upper/lower/title if it fits
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000373 if record[12]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000374 upper = int(record[12], 16) - char
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000375 if -32768 <= upper <= 32767 and delta:
376 upper = upper & 0xffff
377 else:
378 upper += char
379 delta = False
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000380 else:
381 upper = 0
382 if record[13]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000383 lower = int(record[13], 16) - char
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000384 if -32768 <= lower <= 32767 and delta:
385 lower = lower & 0xffff
386 else:
387 lower += char
388 delta = False
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000389 else:
390 lower = 0
391 if record[14]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000392 title = int(record[14], 16) - char
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000393 if -32768 <= lower <= 32767 and delta:
394 title = title & 0xffff
395 else:
396 title += char
397 delta = False
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000398 else:
399 title = 0
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000400 if not delta:
401 flags |= NODELTA_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000402 # decimal digit, integer digit
403 decimal = 0
404 if record[6]:
405 flags |= DECIMAL_MASK
406 decimal = int(record[6])
407 digit = 0
408 if record[7]:
409 flags |= DIGIT_MASK
410 digit = int(record[7])
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000411 item = (
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000412 upper, lower, title, decimal, digit, flags
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000413 )
414 # add entry to index and item tables
415 i = cache.get(item)
416 if i is None:
417 cache[item] = i = len(table)
418 table.append(item)
419 index[char] = i
420
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000421 print len(table), "unique character type entries"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000422
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000423 print "--- Writing", FILE, "..."
424
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000425 fp = open(FILE, "w")
Fred Drake9c685052000-10-26 03:56:46 +0000426 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
427 print >>fp
428 print >>fp, "/* a list of unique character type descriptors */"
429 print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000430 for item in table:
Fred Drake9c685052000-10-26 03:56:46 +0000431 print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
432 print >>fp, "};"
433 print >>fp
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000434
435 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000436 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000437
Fred Drake9c685052000-10-26 03:56:46 +0000438 print >>fp, "/* type indexes */"
439 print >>fp, "#define SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000440 Array("index1", index1).dump(fp, trace)
441 Array("index2", index2).dump(fp, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000442
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000443 fp.close()
444
445# --------------------------------------------------------------------
446# unicode name database
447
448def makeunicodename(unicode, trace):
449
450 FILE = "Modules/unicodename_db.h"
451
452 print "--- Preparing", FILE, "..."
453
454 # collect names
455 names = [None] * len(unicode.chars)
456
457 for char in unicode.chars:
458 record = unicode.table[char]
459 if record:
460 name = record[1].strip()
461 if name and name[0] != "<":
462 names[char] = name + chr(0)
463
464 print len(filter(lambda n: n is not None, names)), "distinct names"
465
466 # collect unique words from names (note that we differ between
467 # words inside a sentence, and words ending a sentence. the
468 # latter includes the trailing null byte.
469
470 words = {}
471 n = b = 0
472 for char in unicode.chars:
473 name = names[char]
474 if name:
475 w = name.split()
476 b = b + len(name)
477 n = n + len(w)
478 for w in w:
479 l = words.get(w)
480 if l:
481 l.append(None)
482 else:
483 words[w] = [len(words)]
484
485 print n, "words in text;", b, "bytes"
486
487 wordlist = words.items()
488
Martin v. Löwis97225da2002-11-24 23:05:09 +0000489 # sort on falling frequency, then by name
490 def cmpwords((aword, alist),(bword, blist)):
491 r = -cmp(len(alist),len(blist))
492 if r:
493 return r
494 return cmp(aword, bword)
495 wordlist.sort(cmpwords)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000496
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000497 # figure out how many phrasebook escapes we need
498 escapes = 0
499 while escapes * 256 < len(wordlist):
500 escapes = escapes + 1
501 print escapes, "escapes"
502
503 short = 256 - escapes
504
505 assert short > 0
506
507 print short, "short indexes in lexicon"
508
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000509 # statistics
510 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000511 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000512 n = n + len(wordlist[i][1])
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000513 print n, "short indexes in phrasebook"
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000514
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000515 # pick the most commonly used words, and sort the rest on falling
516 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000517
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000518 wordlist, wordtail = wordlist[:short], wordlist[short:]
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000519 wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
520 wordlist.extend(wordtail)
521
522 # generate lexicon from words
523
524 lexicon_offset = [0]
525 lexicon = ""
526 words = {}
527
528 # build a lexicon string
529 offset = 0
530 for w, x in wordlist:
531 # encoding: bit 7 indicates last character in word (chr(128)
532 # indicates the last character in an entire string)
533 ww = w[:-1] + chr(ord(w[-1])+128)
534 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000535 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000536 if o < 0:
537 o = offset
538 lexicon = lexicon + ww
539 offset = offset + len(w)
540 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000541 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000542
543 lexicon = map(ord, lexicon)
544
545 # generate phrasebook from names and lexicon
546 phrasebook = [0]
547 phrasebook_offset = [0] * len(unicode.chars)
548 for char in unicode.chars:
549 name = names[char]
550 if name:
551 w = name.split()
552 phrasebook_offset[char] = len(phrasebook)
553 for w in w:
554 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000555 if i < short:
556 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000557 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000558 # store as two bytes
559 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000560 phrasebook.append(i&255)
561
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000562 assert getsize(phrasebook) == 1
563
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000564 #
565 # unicode name hash table
566
567 # extract names
568 data = []
569 for char in unicode.chars:
570 record = unicode.table[char]
571 if record:
572 name = record[1].strip()
573 if name and name[0] != "<":
574 data.append((name, char))
575
576 # the magic number 47 was chosen to minimize the number of
577 # collisions on the current data set. if you like, change it
578 # and see what happens...
579
580 codehash = Hash("code", data, 47)
581
582 print "--- Writing", FILE, "..."
583
584 fp = open(FILE, "w")
585 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
586 print >>fp
587 print >>fp, "#define NAME_MAXLEN", 256
588 print >>fp
589 print >>fp, "/* lexicon */"
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000590 Array("lexicon", lexicon).dump(fp, trace)
591 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000592
593 # split decomposition index table
594 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
595
596 print >>fp, "/* code->name phrasebook */"
597 print >>fp, "#define phrasebook_shift", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000598 print >>fp, "#define phrasebook_short", short
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000599
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000600 Array("phrasebook", phrasebook).dump(fp, trace)
601 Array("phrasebook_offset1", offset1).dump(fp, trace)
602 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000603
604 print >>fp, "/* name->code dictionary */"
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000605 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000606
607 fp.close()
608
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000609
610def merge_old_version(version, new, old):
611 # Changes to exclusion file not implemented yet
612 if old.exclusions != new.exclusions:
613 raise NotImplementedError, "exclusions differ"
614
615 # In these change records, 0xFF means "no change"
616 bidir_changes = [0xFF]*0x110000
617 category_changes = [0xFF]*0x110000
618 decimal_changes = [0xFF]*0x110000
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000619 mirrored_changes = [0xFF]*0x110000
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000620 # In numeric data, 0 means "no change",
621 # -1 means "did not have a numeric value
622 numeric_changes = [0] * 0x110000
623 # normalization_changes is a list of key-value pairs
624 normalization_changes = []
625 for i in range(0x110000):
626 if new.table[i] is None:
627 # Characters unassigned in the new version ought to
628 # be unassigned in the old one
629 assert old.table[i] is None
630 continue
631 # check characters unassigned in the old version
632 if old.table[i] is None:
633 # category 0 is "unassigned"
634 category_changes[i] = 0
635 continue
636 # check characters that differ
637 if old.table[i] != new.table[i]:
638 for k in range(len(old.table[i])):
639 if old.table[i][k] != new.table[i][k]:
640 value = old.table[i][k]
641 if k == 2:
642 #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
643 category_changes[i] = CATEGORY_NAMES.index(value)
644 elif k == 4:
645 #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
646 bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
647 elif k == 5:
648 #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
649 # We assume that all normalization changes are in 1:1 mappings
650 assert " " not in value
651 normalization_changes.append((i, value))
652 elif k == 6:
653 #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
654 # we only support changes where the old value is a single digit
655 assert value in "0123456789"
656 decimal_changes[i] = int(value)
657 elif k == 8:
658 # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
659 # Since 0 encodes "no change", the old value is better not 0
660 assert value != "0" and value != "-1"
661 if not value:
662 numeric_changes[i] = -1
663 else:
664 assert re.match("^[0-9]+$", value)
665 numeric_changes[i] = int(value)
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000666 elif k == 9:
667 if value == 'Y':
668 mirrored_changes[i] = '1'
669 else:
670 mirrored_changes[i] = '0'
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000671 elif k == 11:
672 # change to ISO comment, ignore
673 pass
674 elif k == 12:
675 # change to simple uppercase mapping; ignore
676 pass
677 elif k == 13:
678 # change to simple lowercase mapping; ignore
679 pass
680 elif k == 14:
681 # change to simple titlecase mapping; ignore
682 pass
683 else:
684 class Difference(Exception):pass
685 raise Difference, (hex(i), k, old.table[i], new.table[i])
686 new.changed.append((version, zip(bidir_changes, category_changes,
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000687 decimal_changes, mirrored_changes,
688 numeric_changes),
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000689 normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000690
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000691
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000692# --------------------------------------------------------------------
693# the following support code is taken from the unidb utilities
694# Copyright (c) 1999-2000 by Secret Labs AB
695
696# load a unicode-data file from disk
697
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000698import sys
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000699
class UnicodeData:
    """In-memory representation of the Unicode character database.

    Loads three data files:
      filename       -- UnicodeData.txt-style file: one semicolon-separated
                        record per assigned code point
      exclusions     -- CompositionExclusions.txt-style file: one hex code
                        point per non-comment line
      eastasianwidth -- EastAsianWidth.txt-style file: "codepoint;width" or
                        "first..last;width" per non-comment line; the width
                        class is appended as the last field of every record

    Public attributes: filename, table (one record list or None per code
    point), chars (all code points considered), exclusions (dict keyed by
    code point), changed (version deltas, filled in elsewhere).
    """

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        # deltas against older UCD versions; populated outside __init__
        self.changed = []

        # one slot per code point, U+0000..U+10FFFF
        table = [None] * 0x110000
        file = open(filename)
        try:
            for s in file:
                s = s.strip()
                if not s:
                    continue  # tolerate blank lines (original crashed here)
                s = s.split(";")
                char = int(s[0], 16)
                table[char] = s
        finally:
            file.close()  # original leaked this handle

        # expand first-last ranges: the data file encodes large homogeneous
        # ranges as a single <..., First>/<..., Last> record pair; give
        # every code point in between its own copy of the First record
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""       # range records carry no real name
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i    # patch in this code point
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # composition exclusions: code points excluded from canonical
        # composition (value is always 1; the dict acts as a set)
        self.exclusions = {}
        file = open(exclusions)
        try:
            for s in file:
                s = s.strip()
                if not s or s[0] == '#':
                    continue  # skip blank lines and comments
                char = int(s.split()[0], 16)
                self.exclusions[char] = 1
        finally:
            file.close()

        # east asian width: collect the width class per code point, then
        # append it as an extra trailing field on every existing record
        widths = [None] * 0x110000
        file = open(eastasianwidth)
        try:
            for s in file:
                s = s.strip()
                if not s or s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                if '..' in s[0]:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                    chars = range(first, last+1)
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]
        finally:
            file.close()
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
769
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000770# hash table tools
771
# this is a straightforward reimplementation of Python's built-in
# dictionary type, using a static data structure and a custom string
# hash algorithm.
775
def myhash(s, magic):
    # Case-insensitive string hash used to build the static name tables.
    # Each character is mixed into the accumulator; whenever bits rise
    # above the low 24, the top byte is folded back in and masked off,
    # keeping the running value bounded.
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        overflow = h & 0xff000000
        if overflow:
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
784
# candidate hash-table sizes (powers of two) paired with the probe
# polynomial constant Hash.__init__ uses for that size; the first pair
# whose size exceeds the number of entries is chosen
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
791
class Hash:
    """Build a static, open-addressed hash table from (key, value) pairs.

    The table is sized from SIZES, filled with myhash(), and later dumped
    as a C array plus the #defines (magic, size, poly) that matching C
    lookup code needs to reproduce the same probe sequence.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: smallest power of two larger than the
        # number of entries; poly is the matching probe polynomial
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynominals"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0  # collision counter

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: derive a probe increment from the hash; it is
            # doubled on each further collision and folded with poly
            # whenever it overflows the table mask
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        # empty slots are represented as 0 in the generated C table
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the #defines the
        # C-side lookup needs (magic, size, poly)
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
855
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000856# stuff to deal with arrays of unsigned integers
857
class Array:
    """A named sequence of unsigned integers that can be written out as a
    static C array using the smallest sufficient element type."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array; with trace, report the
        # array's total byte size on stderr
        size = getsize(self.data)
        if trace:
            sys.stderr.write("%s: %d bytes\n" %
                             (self.name, size * len(self.data)))
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static " + ctype + " " + self.name + "[] = {\n")
        if self.data:
            # emit comma-separated values, wrapping lines at 78 columns
            line = "    "
            for value in self.data:
                piece = str(value) + ", "
                if len(line) + len(piece) > 78:
                    file.write(line + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
889
def getsize(data):
    """Return 1, 2, or 4: the byte width of the smallest unsigned C
    integer type able to hold every value in *data*."""
    biggest = max(data)
    if biggest < 0x100:
        return 1
    if biggest < 0x10000:
        return 2
    return 4
899
Tim Peters21013482000-09-25 07:13:41 +0000900def splitbins(t, trace=0):
901 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
902
903 t is a sequence of ints. This function can be useful to save space if
904 many of the ints are the same. t1 and t2 are lists of ints, and shift
905 is an int, chosen to minimize the combined size of t1 and t2 (in C
906 code), and where for each i in range(len(t)),
907 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
908 where mask is a bitmask isolating the last "shift" bits.
909
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000910 If optional arg trace is non-zero (default zero), progress info
911 is printed to sys.stderr. The higher the value, the more info
912 you'll get.
Tim Peters21013482000-09-25 07:13:41 +0000913 """
914
915 import sys
916 if trace:
917 def dump(t1, t2, shift, bytes):
918 print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
919 len(t1), len(t2), shift, bytes)
920 print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
921 "bytes"
922 n = len(t)-1 # last valid index
923 maxshift = 0 # the most we can shift n and still have something left
924 if n > 0:
925 while n >> 1:
926 n >>= 1
927 maxshift += 1
928 del n
929 bytes = sys.maxint # smallest total size so far
930 t = tuple(t) # so slices can be dict keys
931 for shift in range(maxshift + 1):
932 t1 = []
933 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000934 size = 2**shift
935 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +0000936 for i in range(0, len(t), size):
937 bin = t[i:i+size]
938 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000939 if index is None:
Tim Peters21013482000-09-25 07:13:41 +0000940 index = len(t2)
941 bincache[bin] = index
942 t2.extend(bin)
943 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000944 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +0000945 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000946 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +0000947 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000948 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +0000949 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000950 bytes = b
Tim Peters21013482000-09-25 07:13:41 +0000951 t1, t2, shift = best
952 if trace:
953 print >>sys.stderr, "Best:",
954 dump(t1, t2, shift, bytes)
955 if __debug__:
956 # exhaustively verify that the decomposition is correct
957 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
958 for i in xrange(len(t)):
959 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
960 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000961
if __name__ == "__main__":
    # regenerate the databases; maketables() is defined earlier in the
    # full script (the 1 is presumably a trace/verbosity flag -- confirm
    # against the maketables definition)
    maketables(1)