#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl    created (based on bits and pieces from unidb)
# 2000-09-25 fl    merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl    added character type table
# 2000-09-26 fl    added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl    expand first/last ranges
# 2001-01-19 fl    added character name tables (2.1)
# 2001-01-21 fl    added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd    use string methods
# 2002-10-18 mvl   update to Unicode 3.2
# 2002-10-22 mvl   generate NFC tables
# 2002-11-24 mvl   expand all ranges, sort names version-independently
# 2002-11-25 mvl   add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl   update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
import sys

# Name of this script and generator version; both are embedded in the
# "this file was generated by ..." header of every emitted C file.
SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database version the tables are generated from.
UNIDATA_VERSION = "5.1.0"
# Input file name templates.  %s is filled with "" for the current
# version, or "-<version>" for an old one (e.g. "UnicodeData-3.2.0.txt").
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

# Older Unicode versions for which change/delta tables are generated.
old_versions = ["3.2.0"]

# General-category codes; a category's position in this list is the
# integer stored in the generated tables, so the order must match the
# C side.  Index 0 ("Cn") doubles as "unassigned".
# NOTE(review): "Cn" appears twice (indexes 0 and 17); list.index("Cn")
# always yields 0, so slot 17 looks unreachable -- confirm intent.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class codes; position in this list is the stored integer.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East Asian Width codes; position in this list is the stored integer.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Character-type flag bits used by makeunicodetype().
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000063def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +000064
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000065 print "--- Reading", UNICODE_DATA % "", "..."
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000066
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000067 version = ""
68 unicode = UnicodeData(UNICODE_DATA % version,
69 COMPOSITION_EXCLUSIONS % version,
Antoine Pitroue988e282009-04-27 21:53:26 +000070 EASTASIAN_WIDTH % version,
71 DERIVEDNORMALIZATION_PROPS % version)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000072
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000073 print len(filter(None, unicode.table)), "characters"
74
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000075 for version in old_versions:
76 print "--- Reading", UNICODE_DATA % ("-"+version), "..."
77 old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
78 COMPOSITION_EXCLUSIONS % ("-"+version),
79 EASTASIAN_WIDTH % ("-"+version))
80 print len(filter(None, old_unicode.table)), "characters"
81 merge_old_version(version, unicode, old_unicode)
82
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000083 makeunicodename(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000084 makeunicodedata(unicode, trace)
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000085 makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000086
87# --------------------------------------------------------------------
88# unicode character properties
89
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h.

    Emits: the deduplicated per-character property records and their
    two-level index, the compressed decomposition data, the NFC
    composition-pair tables, and (for each entry in unicode.changed)
    delta tables describing older Unicode versions.
    """

    # Record 0 is a dummy shared by all unassigned characters.
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties (stored as small ints; the
            # *_NAMES lists define the int <-> name mapping)
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[16]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables, sharing identical records
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]       # flat int stream: [len<<8|prefix, codepoints...]
    decomp_prefix = [""]    # compatibility tags ("<compat>" etc.); "" = canonical
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # the C nfd_nfkd buffer cannot hold longer decompositions
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix: a leading "<tag>" marks a compatibility mapping
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs length (high byte) and prefix
                # index (low byte), followed by the decomposed codepoints
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs: canonical two-character decompositions
                # whose first character is a starter and which are not in
                # the composition-exclusion list
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints while
                # `decomp` is a list, so .index() never matches and the
                # except branch always appends -- duplicate decompositions
                # are apparently never shared; confirm whether that is the
                # intended behavior.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber NFC first/last characters densely and collect the
    # contiguous codepoint ranges they occupy.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense 2-D composition matrix: (first index, last index) -> composed
    # character (0 where no pair exists).
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table (splitbins is defined later in this file)
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions (filled in by
    # merge_old_version); records are deduplicated the same way as above.
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        # Emit a C lookup function for the change records ...
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        # ... and a switch mapping codepoints to their old 1:1 mappings.
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()
340
341# --------------------------------------------------------------------
342# unicode character type tables
343
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h.

    Emits deduplicated character-type records (upper/lower/title
    mappings, decimal/digit values, and flag bits from the *_MASK
    constants) plus the two-level index used to look them up.
    """

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types; record 0 is a dummy shared by all
    # unassigned characters
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            # NOTE(review): `delta` is assigned but never read; the
            # delta/no-delta decision below uses NODELTA_MASK instead.
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas, stored as 16-bit two's complement
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                # deltas don't fit: store absolute codepoints and flag it
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables, sharing identical records
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split the record index table (splitbins is defined later in file)
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()
447
448# --------------------------------------------------------------------
449# unicode name database
450
def makeunicodename(unicode, trace):
    """Write Modules/unicodename_db.h.

    Builds the compressed code->name "phrasebook" (names split into
    words, words stored once in a shared lexicon, frequent words
    addressed by a single byte) and the name->code hash table.
    """

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names (skip "<...First/Last>" range markers); each stored
    # name is NUL-terminated so end-of-name survives the word encoding
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence. the
    # latter includes the trailing null byte.

    words = {}  # word -> list whose length counts occurrences
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need: escape bytes
    # introduce two-byte indexes for the less common words
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    # words with index < short fit in one phrasebook byte
    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}  # reused: word -> index into lexicon_offset

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon: one byte per common
    # word, two bytes (escape + low byte) for the rest
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry must fit in one byte (getsize is defined
    # later in this file)
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table into a two-level index
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
611
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000612
def merge_old_version(version, new, old):
    """Record the differences between UnicodeData `old` and `new`.

    Appends (version, change_records, normalization_changes) to
    new.changed, where change_records is a per-codepoint zip of
    (bidir, category, decimal, mirrored, numeric) deltas that
    makeunicodedata() turns into C change_record tables.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    # NOTE(review): mirrored_changes mixes 0xFF ints with '1'/'0'
    # strings (set below); this works only because the consumer
    # stringifies every field with map(str, record) -- confirm intent.
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ; k indexes the UnicodeData.txt
        # semicolon-separated fields
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            # NOTE(review): uses `re` but no import of re
                            # is visible in this file view -- verify it is
                            # imported elsewhere in the file.
                            assert re.match("^[0-9]+$", value)
                            numeric_changes[i] = int(value)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        # any other changed field is unsupported
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000693
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000694
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000695# --------------------------------------------------------------------
696# the following support code is taken from the unidb utilities
697# Copyright (c) 1999-2000 by Secret Labs AB
698
699# load a unicode-data file from disk
700
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000701import sys
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000702
703class UnicodeData:
704
    def __init__(self, filename, exclusions, eastasianwidth,
                 derivednormalizationprops=None, expand=1):
        """Load one version of the Unicode Character Database.

        filename/exclusions/eastasianwidth/derivednormalizationprops are
        paths to the corresponding UCD text files.  After loading,
        self.table[codepoint] is the list of semicolon-separated fields
        from UnicodeData.txt (plus appended east-asian-width and, when
        available, quick-check fields), or None for unassigned points.
        """
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges: "<..., First>"/"<..., Last>" marker
        # rows stand for every codepoint in between; give each one a
        # copy of the First row with its own codepoint and empty name
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # composition exclusions: set of codepoints (dict used as a set)
        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        # east asian width: append the width code as an extra field
        # (index 15) of every assigned character's record
        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # normalization quick-check flags: two bits per form, packed in
        # NFD/NFKD/NFC/NFKC order, appended as another extra field
        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            for s in open(derivednormalizationprops):
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    # a property must not be set twice for one char
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000791
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000792 def uselatin1(self):
793 # restrict character range to ISO Latin 1
794 self.chars = range(256)
795
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000796# hash table tools
797
798# this is a straight-forward reimplementation of Python's built-in
799# dictionary type, using a static data structure, and a custom string
800# hash algorithm.
801
def myhash(s, magic):
    """Hash the string *s* case-insensitively with multiplier *magic*.

    A multiply-and-add hash over ord(c) of the uppercased string.
    Whenever bits 24-31 of the running value become set, the top byte
    is folded back into the low bits and cleared, keeping the result
    independent of the platform's native integer width.
    """
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        # 'L' suffix dropped from the literal: it was redundant on
        # Python 2 (ints auto-promote to long) and is a syntax error
        # on Python 3; the computed values are unchanged.
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h
810
# Candidate (size, poly) pairs for the static hash tables: table sizes
# are successive powers of two; Hash.__init__ picks the first size
# larger than the data set and uses size+poly to perturb colliding
# probe increments.  NOTE(review): the poly constants look like they
# were chosen so that size+poly forms a suitable polynomial for the
# double-hash probe sequence -- not verified here.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
817
class Hash:
    """Build-time model of a static open-addressing hash table.

    Constructed from a (key, value) list; the finished table is held as
    an Array and emitted to C together with the #defines (magic, size,
    poly) that the C-side lookup code needs to reproduce the probing.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: the first power of two strictly larger
        # than the number of entries; poly becomes size+poly, the
        # constant used below to fold overflowing probe increments
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynominals"

        print size, "slots in hash table"

        table = [None] * size

        # size is a power of two, so size-1 masks an index into range
        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table: primary slot is the complemented hash;
        # on collision, probe with a secondary increment derived from
        # the hash, doubling it (mod poly) after each failed probe
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1  # total collision count, reported below
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        # fill unused slots with 0 so the emitted C array has no gaps
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the #defines
        # the C lookup code needs to re-run the same probe sequence
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
881
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000882# stuff to deal with arrays of unsigned integers
883
class Array:
    """A named sequence of unsigned integers, dumpable as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as a static C array declaration.

        The element type is the smallest unsigned C type wide enough
        for every value (per getsize); output lines wrap at 78 columns.
        If *trace* is true, the total byte size is reported on stderr.
        """
        size = getsize(self.data)
        if trace:
            # was: print >>sys.stderr, ... -- rewritten with an
            # equivalent write (identical output) so this class also
            # parses under Python 3
            sys.stderr.write("%s: %d bytes\n" % (self.name, size*len(self.data)))
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    # current line is full; flush and start a new one
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")
915
def getsize(data):
    """Return the byte width (1, 2 or 4) of the smallest unsigned C
    integer type able to represent every value in *data*."""
    largest = max(data)
    if largest >= 65536:
        return 4
    if largest >= 256:
        return 2
    return 1
925
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    The split is found by exhaustive search: every feasible shift is
    tried, duplicate bins of size 2**shift are shared via a cache, and
    the (t1, t2, shift) triple with the smallest combined byte size wins.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        # carve t into bins of 2**shift entries; identical bins are
        # stored in t2 only once, and t1 indexes them by bin number
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size of this candidate split
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000987
if __name__ == "__main__":
    # Regenerate the Unicode database headers when run as a script.
    # NOTE(review): maketables is defined earlier in this file; the
    # argument presumably enables trace output -- confirm against its
    # signature.
    maketables(1)