blob: 3cd5a1f842b09d74d5f3308d10576f843a5c7891 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00004# this script converts a unicode 3.2 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000022# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Fredrik Lundhcfcea492000-09-25 08:07:06 +000023#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000024# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000025#
26
27import sys
28
SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.1.0"
# %s is filled with "" for the current version, or "-<version>" for
# the old snapshots listed in old_versions below.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"

# older UCD snapshots for which delta tables are generated
old_versions = ["3.2.0"]

# list position is the numeric category code written into the generated
# tables (index 0, "Cn", is "unassigned" -- see merge_old_version).
# NOTE(review): "Cn" appears twice (positions 0 and 17) -- presumably
# intentional padding of the historical code assignment; confirm before
# reordering.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# list position is the numeric bidirectional code written into the tables
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# list position is the numeric east-asian-width code written into the tables
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# character-type flag bits, combined into the flags field emitted by
# makeunicodetype below
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
# set when an upper/lower/title mapping does not fit the 16-bit delta
# predictor (see makeunicodetype)
NODELTA_MASK = 0x100
Fredrik Lundhe9133f72000-09-25 17:59:57 +000061
def maketables(trace=0):
    """Top-level driver: load the current UCD files (plus each snapshot in
    old_versions), merge the old snapshots into delta records, and write the
    three generated headers (name, data and type databases)."""

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version)

    # table entries are None for unassigned code points; count the rest
    print len(filter(None, unicode.table)), "characters"

    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        # record per-field deltas on unicode.changed for makeunicodedata
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000084
85# --------------------------------------------------------------------
86# unicode character properties
87
88def makeunicodedata(unicode, trace):
89
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +000090 dummy = (0, 0, 0, 0, 0)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000091 table = [dummy]
92 cache = {0: dummy}
93 index = [0] * len(unicode.chars)
94
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000095 FILE = "Modules/unicodedata_db.h"
96
97 print "--- Preparing", FILE, "..."
98
Fredrik Lundhcfcea492000-09-25 08:07:06 +000099 # 1) database properties
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000100
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000101 for char in unicode.chars:
102 record = unicode.table[char]
103 if record:
104 # extract database properties
105 category = CATEGORY_NAMES.index(record[2])
106 combining = int(record[3])
107 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
108 mirrored = record[9] == "Y"
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000109 eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000110 item = (
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000111 category, combining, bidirectional, mirrored, eastasianwidth
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000112 )
113 # add entry to index and item tables
114 i = cache.get(item)
115 if i is None:
116 cache[item] = i = len(table)
117 table.append(item)
118 index[char] = i
119
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000120 # 2) decomposition data
121
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000122 decomp_data = [0]
123 decomp_prefix = [""]
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000124 decomp_index = [0] * len(unicode.chars)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000125 decomp_size = 0
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000126
Martin v. Löwis677bde22002-11-23 22:08:15 +0000127 comp_pairs = []
128 comp_first = [None] * len(unicode.chars)
129 comp_last = [None] * len(unicode.chars)
130
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000131 for char in unicode.chars:
132 record = unicode.table[char]
133 if record:
134 if record[5]:
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000135 decomp = record[5].split()
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000136 if len(decomp) > 19:
137 raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000138 # prefix
139 if decomp[0][0] == "<":
140 prefix = decomp.pop(0)
141 else:
142 prefix = ""
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000143 try:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000144 i = decomp_prefix.index(prefix)
145 except ValueError:
146 i = len(decomp_prefix)
147 decomp_prefix.append(prefix)
148 prefix = i
149 assert prefix < 256
150 # content
151 decomp = [prefix + (len(decomp)<<8)] +\
152 map(lambda s: int(s, 16), decomp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000153 # Collect NFC pairs
154 if not prefix and len(decomp) == 3 and \
155 char not in unicode.exclusions and \
156 unicode.table[decomp[1]][3] == "0":
157 p, l, r = decomp
158 comp_first[l] = 1
159 comp_last[r] = 1
160 comp_pairs.append((l,r,char))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000161 try:
162 i = decomp_data.index(decomp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000163 except ValueError:
164 i = len(decomp_data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000165 decomp_data.extend(decomp)
166 decomp_size = decomp_size + len(decomp) * 2
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000167 else:
168 i = 0
169 decomp_index[char] = i
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000170
Martin v. Löwis677bde22002-11-23 22:08:15 +0000171 f = l = 0
172 comp_first_ranges = []
173 comp_last_ranges = []
174 prev_f = prev_l = None
175 for i in unicode.chars:
176 if comp_first[i] is not None:
177 comp_first[i] = f
178 f += 1
179 if prev_f is None:
180 prev_f = (i,i)
181 elif prev_f[1]+1 == i:
182 prev_f = prev_f[0],i
183 else:
184 comp_first_ranges.append(prev_f)
185 prev_f = (i,i)
186 if comp_last[i] is not None:
187 comp_last[i] = l
188 l += 1
189 if prev_l is None:
190 prev_l = (i,i)
191 elif prev_l[1]+1 == i:
192 prev_l = prev_l[0],i
193 else:
194 comp_last_ranges.append(prev_l)
195 prev_l = (i,i)
196 comp_first_ranges.append(prev_f)
197 comp_last_ranges.append(prev_l)
198 total_first = f
199 total_last = l
200
201 comp_data = [0]*(total_first*total_last)
202 for f,l,char in comp_pairs:
203 f = comp_first[f]
204 l = comp_last[l]
205 comp_data[f*total_last+l] = char
206
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000207 print len(table), "unique properties"
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000208 print len(decomp_prefix), "unique decomposition prefixes"
209 print len(decomp_data), "unique decomposition entries:",
210 print decomp_size, "bytes"
Martin v. Löwis677bde22002-11-23 22:08:15 +0000211 print total_first, "first characters in NFC"
212 print total_last, "last characters in NFC"
213 print len(comp_pairs), "NFC pairs"
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000214
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000215 print "--- Writing", FILE, "..."
216
Fred Drake9c685052000-10-26 03:56:46 +0000217 fp = open(FILE, "w")
218 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
219 print >>fp
Martin v. Löwisb5c980b2002-11-25 09:13:37 +0000220 print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
Fred Drake9c685052000-10-26 03:56:46 +0000221 print >>fp, "/* a list of unique database records */"
222 print >>fp, \
223 "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000224 for item in table:
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000225 print >>fp, " {%d, %d, %d, %d, %d}," % item
Fred Drake9c685052000-10-26 03:56:46 +0000226 print >>fp, "};"
227 print >>fp
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000228
Martin v. Löwis677bde22002-11-23 22:08:15 +0000229 print >>fp, "/* Reindexing of NFC first characters. */"
230 print >>fp, "#define TOTAL_FIRST",total_first
231 print >>fp, "#define TOTAL_LAST",total_last
232 print >>fp, "struct reindex{int start;short count,index;};"
Martin v. Löwis111c1802008-06-13 07:47:47 +0000233 print >>fp, "static struct reindex nfc_first[] = {"
Martin v. Löwis677bde22002-11-23 22:08:15 +0000234 for start,end in comp_first_ranges:
235 print >>fp," { %d, %d, %d}," % (start,end-start,comp_first[start])
236 print >>fp," {0,0,0}"
237 print >>fp,"};\n"
Martin v. Löwis111c1802008-06-13 07:47:47 +0000238 print >>fp, "static struct reindex nfc_last[] = {"
Martin v. Löwis677bde22002-11-23 22:08:15 +0000239 for start,end in comp_last_ranges:
240 print >>fp," { %d, %d, %d}," % (start,end-start,comp_last[start])
241 print >>fp," {0,0,0}"
242 print >>fp,"};\n"
243
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000244 # FIXME: <fl> the following tables could be made static, and
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000245 # the support code moved into unicodedatabase.c
246
Fred Drake9c685052000-10-26 03:56:46 +0000247 print >>fp, "/* string literals */"
248 print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000249 for name in CATEGORY_NAMES:
Fred Drake9c685052000-10-26 03:56:46 +0000250 print >>fp, " \"%s\"," % name
251 print >>fp, " NULL"
252 print >>fp, "};"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000253
Fred Drake9c685052000-10-26 03:56:46 +0000254 print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000255 for name in BIDIRECTIONAL_NAMES:
Fred Drake9c685052000-10-26 03:56:46 +0000256 print >>fp, " \"%s\"," % name
257 print >>fp, " NULL"
258 print >>fp, "};"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000259
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000260 print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
261 for name in EASTASIANWIDTH_NAMES:
262 print >>fp, " \"%s\"," % name
263 print >>fp, " NULL"
264 print >>fp, "};"
265
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000266 print >>fp, "static const char *decomp_prefix[] = {"
267 for name in decomp_prefix:
Fred Drake9c685052000-10-26 03:56:46 +0000268 print >>fp, " \"%s\"," % name
269 print >>fp, " NULL"
270 print >>fp, "};"
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000271
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000272 # split record index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000273 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000274
Fred Drake9c685052000-10-26 03:56:46 +0000275 print >>fp, "/* index tables for the database records */"
276 print >>fp, "#define SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000277 Array("index1", index1).dump(fp, trace)
278 Array("index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000279
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000280 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000281 index1, index2, shift = splitbins(decomp_index, trace)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000282
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000283 print >>fp, "/* decomposition data */"
284 Array("decomp_data", decomp_data).dump(fp, trace)
285
Fred Drake9c685052000-10-26 03:56:46 +0000286 print >>fp, "/* index tables for the decomposition data */"
287 print >>fp, "#define DECOMP_SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000288 Array("decomp_index1", index1).dump(fp, trace)
289 Array("decomp_index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000290
Martin v. Löwis677bde22002-11-23 22:08:15 +0000291 index, index2, shift = splitbins(comp_data, trace)
292 print >>fp, "/* NFC pairs */"
293 print >>fp, "#define COMP_SHIFT", shift
294 Array("comp_index", index).dump(fp, trace)
295 Array("comp_data", index2).dump(fp, trace)
296
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000297 # Generate delta tables for old versions
298 for version, table, normalization in unicode.changed:
299 cversion = version.replace(".","_")
300 records = [table[0]]
301 cache = {table[0]:0}
302 index = [0] * len(table)
303 for i, record in enumerate(table):
304 try:
305 index[i] = cache[record]
306 except KeyError:
307 index[i] = cache[record] = len(records)
308 records.append(record)
309 index1, index2, shift = splitbins(index, trace)
310 print >>fp, "static const change_record change_records_%s[] = {" % cversion
311 for record in records:
312 print >>fp, "\t{ %s }," % ", ".join(map(str,record))
313 print >>fp, "};"
314 Array("changes_%s_index" % cversion, index1).dump(fp, trace)
315 Array("changes_%s_data" % cversion, index2).dump(fp, trace)
316 print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
317 print >>fp, "{"
318 print >>fp, "\tint index;"
319 print >>fp, "\tif (n >= 0x110000) index = 0;"
320 print >>fp, "\telse {"
321 print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
322 print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
323 (cversion, shift, ((1<<shift)-1))
324 print >>fp, "\t}"
325 print >>fp, "\treturn change_records_%s+index;" % cversion
326 print >>fp, "}\n"
327 print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
328 print >>fp, "{"
329 print >>fp, "\tswitch(n) {"
330 for k, v in normalization:
331 print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
332 print >>fp, "\tdefault: return 0;"
333 print >>fp, "\t}\n}\n"
334
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000335 fp.close()
336
337# --------------------------------------------------------------------
338# unicode character type tables
339
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h: deduplicated character type records
    (upper/lower/title mappings, decimal/digit values, flag bits) plus the
    two-level index that maps code points to records."""

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            # NOTE(review): `delta` is assigned but never read in this
            # function -- apparently a leftover; confirm before relying on it
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas, stored as 16-bit two's complement
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                # mapping too far away: store absolute values and flag it
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables (records are deduplicated)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()
443
444# --------------------------------------------------------------------
445# unicode name database
446
def makeunicodename(unicode, trace):
    """Write Modules/unicodename_db.h: a compressed lexicon of name words,
    a phrasebook mapping code points to word sequences, and a static hash
    table mapping names back to code points."""

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names (control/surrogate "<...>" labels are skipped)
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish between
    # words inside a sentence, and words ending a sentence; the
    # latter includes the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                # the list length doubles as a use counter; the first
                # element is the word's original index
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    # word indexes below `short` fit in one phrasebook byte; the rest
    # are stored as an escape byte plus a second byte
    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry must fit in a single byte
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table into two levels
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
607
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000608
def merge_old_version(version, new, old):
    """Compute per-field deltas between an old UCD snapshot and the new one.

    Appends (version, change_records, normalization_changes) to new.changed,
    where change_records is a per-code-point zip of (bidir, category,
    decimal, mirrored, numeric) changes consumed by makeunicodedata's
    delta-table generation.

    Raises NotImplementedError if the composition exclusions differ (not
    supported), and fails an assertion on any unsupported field change.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            # bug fix: the original asserted re.match("^[0-9]+$",
                            # value), but `re` is never imported in this script,
                            # so reaching this line raised NameError.  str.isdigit
                            # performs the equivalent check.
                            assert value.isdigit()
                            numeric_changes[i] = int(value)
                    elif k == 9:
                        # mirrored flag change, stored as a '0'/'1' character
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        # unexpected field change: fail loudly with context
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000689
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000690
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000691# --------------------------------------------------------------------
692# the following support code is taken from the unidb utilities
693# Copyright (c) 1999-2000 by Secret Labs AB
694
695# load a unicode-data file from disk
696
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000697import sys
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000698
class UnicodeData:
    """In-memory view of one UCD snapshot.

    Public attributes set by __init__:
      filename   -- the UnicodeData file that was read
      table      -- list of 0x110000 entries; each is the semicolon-split
                    field list for that code point (with the east-asian
                    width appended as the last field), or None if unassigned
      chars      -- the code point range to iterate (0..0x10FFFF)
      exclusions -- dict keyed by code points listed in the composition
                    exclusions file
      changed    -- list filled in later by merge_old_version
    """

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            # fields are semicolon-separated; field 0 is the hex code point
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges: "<..., First>"/"<..., Last>" record
        # pairs stand for every code point in between; give each one a
        # copy of the First record with its own code point and empty name
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                    elif field:
                        f2 = field[:]
                        f2[0] = "%X" % i
                        table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # composition exclusions: one hex code point per non-comment line
        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        # east asian width: "codepoint;width" or "first..last;width" lines;
        # the width letter is appended to each assigned table record
        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
768
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000769# hash table tools
770
771# this is a straight-forward reimplementation of Python's built-in
772# dictionary type, using a static data structure, and a custom string
773# hash algorithm.
774
def myhash(s, magic):
    """Hash string s, case-insensitively, into the 24-bit range.

    magic is the multiplier.  After every character, any bits that
    overflowed above bit 24 are folded back into the low bits so the
    result stays within 24 bits.

    Fix vs. original: the 0xff000000L long literal is Python-2-only
    syntax (SyntaxError on 3.x); the plain literal has the same value.
    """
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            # fold the overflow byte back in, then mask to 24 bits
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h
783
# (table size, poly) candidate pairs used by Hash below: its constructor
# scans this list in order, picks the first power-of-two size with more
# slots than entries, and folds size+poly into the probe-increment mixing
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
790
791class Hash:
792 def __init__(self, name, data, magic):
793 # turn a (key, value) list into a static hash table structure
794
795 # determine table size
796 for size, poly in SIZES:
797 if size > len(data):
798 poly = size + poly
799 break
800 else:
801 raise AssertionError, "ran out of polynominals"
802
803 print size, "slots in hash table"
804
805 table = [None] * size
806
807 mask = size-1
808
809 n = 0
810
811 hash = myhash
812
813 # initialize hash table
814 for key, value in data:
815 h = hash(key, magic)
816 i = (~h) & mask
817 v = table[i]
818 if v is None:
819 table[i] = value
820 continue
821 incr = (h ^ (h >> 3)) & mask;
822 if not incr:
823 incr = mask
824 while 1:
825 n = n + 1
826 i = (i + incr) & mask
827 v = table[i]
828 if v is None:
829 table[i] = value
830 break
831 incr = incr << 1
832 if incr > mask:
833 incr = incr ^ poly
834
835 print n, "collisions"
836 self.collisions = n
837
838 for i in range(len(table)):
839 if table[i] is None:
840 table[i] = 0
841
842 self.data = Array(name + "_hash", table)
843 self.magic = magic
844 self.name = name
845 self.size = size
846 self.poly = poly
847
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000848 def dump(self, file, trace):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000849 # write data to file, as a C array
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000850 self.data.dump(file, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000851 file.write("#define %s_magic %d\n" % (self.name, self.magic))
852 file.write("#define %s_size %d\n" % (self.name, self.size))
853 file.write("#define %s_poly %d\n" % (self.name, self.poly))
854
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000855# stuff to deal with arrays of unsigned integers
856
class Array:
    """A named array of unsigned integers, dumpable as C source.

    Fix vs. original: the py2-only `print >>sys.stderr, ...` chevron
    syntax in dump() is replaced by an equivalent sys.stderr.write call
    with byte-identical output, valid on both Python 2 and 3.
    """

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to file as a static C array declaration.

        The element type (unsigned char/short/int) is the smallest that
        holds every value.  If trace is true, report the total byte size
        to stderr.
        """
        size = getsize(self.data)
        if trace:
            sys.stderr.write("%s: %d bytes\n" %
                             (self.name, size*len(self.data)))
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    # wrap output lines at 78 columns
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")
888
def getsize(data):
    """Return the smallest C integer size (in bytes) that fits max(data)."""
    largest = max(data)
    if largest >= 65536:
        return 4
    if largest >= 256:
        return 2
    return 1
898
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.

    Fixes vs. original: py2-only constructs replaced by behaviorally
    identical 2/3-compatible ones (`print >>sys.stderr` -> stderr.write,
    `sys.maxint` sentinel -> None, `xrange` -> range); the redundant
    function-local `import sys` dropped (the module imports sys at the
    top); the local named `bytes` no longer shadows the builtin.
    """
    if trace:
        def dump(t1, t2, shift, nbytes):
            # progress line: bin counts, shift, and total byte size
            sys.stderr.write("%d+%d bins at shift %d; %d bytes\n" % (
                len(t1), len(t2), shift, nbytes))
        sys.stderr.write("Size of original table: %d bytes\n" %
                         (len(t)*getsize(t)))
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    smallest = None     # smallest total size so far (None = none yet)
    t = tuple(t)        # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        # cut t into bins of 2**shift entries; identical bins are stored
        # in t2 only once, and t1 records where each bin starts
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size of this decomposition
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if smallest is None or b < smallest:
            best = t1, t2, shift
            smallest = b
    t1, t2, shift = best
    if trace:
        # "Best: " prefix then the dump line, matching the original's
        # softspace-joined two-print output
        sys.stderr.write("Best: ")
        dump(t1, t2, shift, smallest)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000960
if __name__ == "__main__":
    # regenerate the database tables with tracing enabled (trace=1);
    # maketables is defined earlier in the full script, above this chunk
    maketables(1)