blob: 3a362ec7f02174314e10434d5bf0df034a141f4f [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
4# this script converts a unicode 3.0 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundhcfcea492000-09-25 08:07:06 +000015#
16# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
Fredrik Lundhf367cac2000-09-24 23:18:31 +000017#
18
19import sys
20
21SCRIPT = sys.argv[0]
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000022VERSION = "2.1"
Fredrik Lundhf367cac2000-09-24 23:18:31 +000023
Fredrik Lundhe9133f72000-09-25 17:59:57 +000024UNICODE_DATA = "UnicodeData-Latest.txt"
Fredrik Lundhf367cac2000-09-24 23:18:31 +000025
26CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
27 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
28 "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
29 "So" ]
30
31BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
32 "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
33 "ON" ]
34
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000035# note: should match definitions in Objects/unicodectype.c
Fredrik Lundhe9133f72000-09-25 17:59:57 +000036ALPHA_MASK = 0x01
37DECIMAL_MASK = 0x02
38DIGIT_MASK = 0x04
39LOWER_MASK = 0x08
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000040LINEBREAK_MASK = 0x10
Fredrik Lundhe9133f72000-09-25 17:59:57 +000041SPACE_MASK = 0x20
42TITLE_MASK = 0x40
43UPPER_MASK = 0x80
44
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000045def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +000046
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000047 print "--- Reading", UNICODE_DATA, "..."
48
Fredrik Lundhf367cac2000-09-24 23:18:31 +000049 unicode = UnicodeData(UNICODE_DATA)
50
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000051 print len(filter(None, unicode.table)), "characters"
52
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000053 makeunicodedata(unicode, trace)
54 makeunicodetype(unicode, trace)
55 makeunicodename(unicode, trace)
56
57# --------------------------------------------------------------------
58# unicode character properties
59
60def makeunicodedata(unicode, trace):
61
Fredrik Lundhcfcea492000-09-25 08:07:06 +000062 dummy = (0, 0, 0, 0)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000063 table = [dummy]
64 cache = {0: dummy}
65 index = [0] * len(unicode.chars)
66
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000067 FILE = "Modules/unicodedata_db.h"
68
69 print "--- Preparing", FILE, "..."
70
Fredrik Lundhcfcea492000-09-25 08:07:06 +000071 # 1) database properties
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000072
Fredrik Lundhf367cac2000-09-24 23:18:31 +000073 for char in unicode.chars:
74 record = unicode.table[char]
75 if record:
76 # extract database properties
77 category = CATEGORY_NAMES.index(record[2])
78 combining = int(record[3])
79 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
80 mirrored = record[9] == "Y"
Fredrik Lundhf367cac2000-09-24 23:18:31 +000081 item = (
Fredrik Lundhcfcea492000-09-25 08:07:06 +000082 category, combining, bidirectional, mirrored
Fredrik Lundhf367cac2000-09-24 23:18:31 +000083 )
84 # add entry to index and item tables
85 i = cache.get(item)
86 if i is None:
87 cache[item] = i = len(table)
88 table.append(item)
89 index[char] = i
90
Fredrik Lundhcfcea492000-09-25 08:07:06 +000091 # 2) decomposition data
92
93 # FIXME: <fl> using the encoding stuff from unidb would save
94 # another 50k or so, but I'll leave that for 2.1...
95
96 decomp_data = [""]
97 decomp_index = [0] * len(unicode.chars)
98
99 for char in unicode.chars:
100 record = unicode.table[char]
101 if record:
102 if record[5]:
103 try:
104 i = decomp_data.index(record[5])
105 except ValueError:
106 i = len(decomp_data)
107 decomp_data.append(record[5])
108 else:
109 i = 0
110 decomp_index[char] = i
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000111
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000112 print len(table), "unique properties"
113 print len(decomp_data), "unique decomposition entries"
114
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000115 print "--- Writing", FILE, "..."
116
Fred Drake9c685052000-10-26 03:56:46 +0000117 fp = open(FILE, "w")
118 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
119 print >>fp
120 print >>fp, "/* a list of unique database records */"
121 print >>fp, \
122 "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000123 for item in table:
Fred Drake9c685052000-10-26 03:56:46 +0000124 print >>fp, " {%d, %d, %d, %d}," % item
125 print >>fp, "};"
126 print >>fp
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000127
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000128 # FIXME: <fl> the following tables could be made static, and
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000129 # the support code moved into unicodedatabase.c
130
Fred Drake9c685052000-10-26 03:56:46 +0000131 print >>fp, "/* string literals */"
132 print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000133 for name in CATEGORY_NAMES:
Fred Drake9c685052000-10-26 03:56:46 +0000134 print >>fp, " \"%s\"," % name
135 print >>fp, " NULL"
136 print >>fp, "};"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000137
Fred Drake9c685052000-10-26 03:56:46 +0000138 print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000139 for name in BIDIRECTIONAL_NAMES:
Fred Drake9c685052000-10-26 03:56:46 +0000140 print >>fp, " \"%s\"," % name
141 print >>fp, " NULL"
142 print >>fp, "};"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000143
Fred Drake9c685052000-10-26 03:56:46 +0000144 print >>fp, "static const char *decomp_data[] = {"
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000145 for name in decomp_data:
Fred Drake9c685052000-10-26 03:56:46 +0000146 print >>fp, " \"%s\"," % name
147 print >>fp, " NULL"
148 print >>fp, "};"
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000149
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000150 # split record index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000151 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000152
Fred Drake9c685052000-10-26 03:56:46 +0000153 print >>fp, "/* index tables for the database records */"
154 print >>fp, "#define SHIFT", shift
155 Array("index1", index1).dump(fp)
156 Array("index2", index2).dump(fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000157
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000158 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000159 index1, index2, shift = splitbins(decomp_index, trace)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000160
Fred Drake9c685052000-10-26 03:56:46 +0000161 print >>fp, "/* index tables for the decomposition data */"
162 print >>fp, "#define DECOMP_SHIFT", shift
163 Array("decomp_index1", index1).dump(fp)
164 Array("decomp_index2", index2).dump(fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000165
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000166 fp.close()
167
168# --------------------------------------------------------------------
169# unicode character type tables
170
171def makeunicodetype(unicode, trace):
172
173 FILE = "Objects/unicodetype_db.h"
174
175 print "--- Preparing", FILE, "..."
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000176
177 # extract unicode types
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000178 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000179 table = [dummy]
180 cache = {0: dummy}
181 index = [0] * len(unicode.chars)
182
183 for char in unicode.chars:
184 record = unicode.table[char]
185 if record:
186 # extract database properties
187 category = record[2]
188 bidirectional = record[4]
189 flags = 0
190 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
191 flags |= ALPHA_MASK
192 if category == "Ll":
193 flags |= LOWER_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000194 if category == "Zl" or bidirectional == "B":
195 flags |= LINEBREAK_MASK
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000196 if category == "Zs" or bidirectional in ("WS", "B", "S"):
197 flags |= SPACE_MASK
Fredrik Lundh375732c2000-09-25 23:03:34 +0000198 if category == "Lt":
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000199 flags |= TITLE_MASK
200 if category == "Lu":
201 flags |= UPPER_MASK
202 # use delta predictor for upper/lower/title
203 if record[12]:
204 upper = (int(record[12], 16) - char) & 0xffff
205 else:
206 upper = 0
207 if record[13]:
208 lower = (int(record[13], 16) - char) & 0xffff
209 else:
210 lower = 0
211 if record[14]:
212 title = (int(record[14], 16) - char) & 0xffff
213 else:
214 title = 0
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000215 # decimal digit, integer digit
216 decimal = 0
217 if record[6]:
218 flags |= DECIMAL_MASK
219 decimal = int(record[6])
220 digit = 0
221 if record[7]:
222 flags |= DIGIT_MASK
223 digit = int(record[7])
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000224 item = (
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000225 flags, upper, lower, title, decimal, digit
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000226 )
227 # add entry to index and item tables
228 i = cache.get(item)
229 if i is None:
230 cache[item] = i = len(table)
231 table.append(item)
232 index[char] = i
233
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000234 print len(table), "unique character type entries"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000235
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000236 print "--- Writing", FILE, "..."
237
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000238 fp = open(FILE, "w")
Fred Drake9c685052000-10-26 03:56:46 +0000239 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
240 print >>fp
241 print >>fp, "/* a list of unique character type descriptors */"
242 print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000243 for item in table:
Fred Drake9c685052000-10-26 03:56:46 +0000244 print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
245 print >>fp, "};"
246 print >>fp
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000247
248 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000249 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000250
Fred Drake9c685052000-10-26 03:56:46 +0000251 print >>fp, "/* type indexes */"
252 print >>fp, "#define SHIFT", shift
253 Array("index1", index1).dump(fp)
254 Array("index2", index2).dump(fp)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000255
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000256 fp.close()
257
258# --------------------------------------------------------------------
259# unicode name database
260
261def makeunicodename(unicode, trace):
262
263 FILE = "Modules/unicodename_db.h"
264
265 print "--- Preparing", FILE, "..."
266
267 # collect names
268 names = [None] * len(unicode.chars)
269
270 for char in unicode.chars:
271 record = unicode.table[char]
272 if record:
273 name = record[1].strip()
274 if name and name[0] != "<":
275 names[char] = name + chr(0)
276
277 print len(filter(lambda n: n is not None, names)), "distinct names"
278
279 # collect unique words from names (note that we differ between
280 # words inside a sentence, and words ending a sentence. the
281 # latter includes the trailing null byte.
282
283 words = {}
284 n = b = 0
285 for char in unicode.chars:
286 name = names[char]
287 if name:
288 w = name.split()
289 b = b + len(name)
290 n = n + len(w)
291 for w in w:
292 l = words.get(w)
293 if l:
294 l.append(None)
295 else:
296 words[w] = [len(words)]
297
298 print n, "words in text;", b, "bytes"
299
300 wordlist = words.items()
301
302 # sort on falling frequency
303 wordlist.sort(lambda a, b: len(b[1])-len(a[1]))
304
305 # statistics
306 n = 0
307 for i in range(128):
308 n = n + len(wordlist[i][1])
309 print n, "short words (7-bit indices)"
310
311 # pick the 128 most commonly used words, and sort the rest on
312 # falling length (to maximize overlap)
313
314 wordlist, wordtail = wordlist[:128], wordlist[128:]
315 wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
316 wordlist.extend(wordtail)
317
318 # generate lexicon from words
319
320 lexicon_offset = [0]
321 lexicon = ""
322 words = {}
323
324 # build a lexicon string
325 offset = 0
326 for w, x in wordlist:
327 # encoding: bit 7 indicates last character in word (chr(128)
328 # indicates the last character in an entire string)
329 ww = w[:-1] + chr(ord(w[-1])+128)
330 # reuse string tails, when possible
331 o = string.find(lexicon, ww)
332 if o < 0:
333 o = offset
334 lexicon = lexicon + ww
335 offset = offset + len(w)
336 words[w] = len(lexicon_offset)
337 lexicon_offset.append(offset)
338
339 print len(words), "words in lexicon;", len(lexicon), "bytes"
340
341 assert len(words) < 32768 # 15-bit word indices
342
343 lexicon = map(ord, lexicon)
344
345 # generate phrasebook from names and lexicon
346 phrasebook = [0]
347 phrasebook_offset = [0] * len(unicode.chars)
348 for char in unicode.chars:
349 name = names[char]
350 if name:
351 w = name.split()
352 phrasebook_offset[char] = len(phrasebook)
353 for w in w:
354 i = words[w]
355 if i < 128:
356 phrasebook.append(128+i)
357 else:
358 phrasebook.append(i>>8)
359 phrasebook.append(i&255)
360
361 #
362 # unicode name hash table
363
364 # extract names
365 data = []
366 for char in unicode.chars:
367 record = unicode.table[char]
368 if record:
369 name = record[1].strip()
370 if name and name[0] != "<":
371 data.append((name, char))
372
373 # the magic number 47 was chosen to minimize the number of
374 # collisions on the current data set. if you like, change it
375 # and see what happens...
376
377 codehash = Hash("code", data, 47)
378
379 print "--- Writing", FILE, "..."
380
381 fp = open(FILE, "w")
382 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
383 print >>fp
384 print >>fp, "#define NAME_MAXLEN", 256
385 print >>fp
386 print >>fp, "/* lexicon */"
387 Array("lexicon", lexicon).dump(fp)
388 Array("lexicon_offset", lexicon_offset).dump(fp)
389
390 # split decomposition index table
391 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
392
393 print >>fp, "/* code->name phrasebook */"
394 print >>fp, "#define phrasebook_shift", shift
395
396 Array("phrasebook", phrasebook).dump(fp)
397 Array("phrasebook_offset1", offset1).dump(fp)
398 Array("phrasebook_offset2", offset2).dump(fp)
399
400 print >>fp, "/* name->code dictionary */"
401 codehash.dump(fp)
402
403 fp.close()
404
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000405# --------------------------------------------------------------------
406# the following support code is taken from the unidb utilities
407# Copyright (c) 1999-2000 by Secret Labs AB
408
409# load a unicode-data file from disk
410
411import string, sys
412
413class UnicodeData:
414
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000415 def __init__(self, filename, expand=1):
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000416 file = open(filename)
417 table = [None] * 65536
418 while 1:
419 s = file.readline()
420 if not s:
421 break
422 s = string.split(string.strip(s), ";")
423 char = string.atoi(s[0], 16)
424 table[char] = s
425
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000426 # expand first-last ranges (ignore surrogates and private use)
427 if expand:
428 field = None
429 for i in range(0, 0xD800):
430 s = table[i]
431 if s:
432 if s[1][-6:] == "First>":
433 s[1] = ""
434 field = s[:]
435 elif s[1][-5:] == "Last>":
436 s[1] = ""
437 field = None
438 elif field:
439 field[0] = hex(i)
440 table[i] = field
441
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000442 # public attributes
443 self.filename = filename
444 self.table = table
445 self.chars = range(65536) # unicode
446
447 def uselatin1(self):
448 # restrict character range to ISO Latin 1
449 self.chars = range(256)
450
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000451# hash table tools
452
453# this is a straight-forward reimplementation of Python's built-in
454# dictionary type, using a static data structure, and a custom string
455# hash algorithm.
456
457def myhash(s, magic):
458 h = 0
459 for c in map(ord, string.upper(s)):
460 h = (h * magic) + c
461 ix = h & 0xff000000
462 if ix:
463 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
464 return h
465
466SIZES = [
467 (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
468 (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
469 (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
470 (2097152,5), (4194304,3), (8388608,33), (16777216,27)
471]
472
473class Hash:
474 def __init__(self, name, data, magic):
475 # turn a (key, value) list into a static hash table structure
476
477 # determine table size
478 for size, poly in SIZES:
479 if size > len(data):
480 poly = size + poly
481 break
482 else:
483 raise AssertionError, "ran out of polynominals"
484
485 print size, "slots in hash table"
486
487 table = [None] * size
488
489 mask = size-1
490
491 n = 0
492
493 hash = myhash
494
495 # initialize hash table
496 for key, value in data:
497 h = hash(key, magic)
498 i = (~h) & mask
499 v = table[i]
500 if v is None:
501 table[i] = value
502 continue
503 incr = (h ^ (h >> 3)) & mask;
504 if not incr:
505 incr = mask
506 while 1:
507 n = n + 1
508 i = (i + incr) & mask
509 v = table[i]
510 if v is None:
511 table[i] = value
512 break
513 incr = incr << 1
514 if incr > mask:
515 incr = incr ^ poly
516
517 print n, "collisions"
518 self.collisions = n
519
520 for i in range(len(table)):
521 if table[i] is None:
522 table[i] = 0
523
524 self.data = Array(name + "_hash", table)
525 self.magic = magic
526 self.name = name
527 self.size = size
528 self.poly = poly
529
530 def dump(self, file):
531 # write data to file, as a C array
532 self.data.dump(file)
533 file.write("#define %s_magic %d\n" % (self.name, self.magic))
534 file.write("#define %s_size %d\n" % (self.name, self.size))
535 file.write("#define %s_poly %d\n" % (self.name, self.poly))
536
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000537# stuff to deal with arrays of unsigned integers
538
539class Array:
540
541 def __init__(self, name, data):
542 self.name = name
543 self.data = data
544
545 def dump(self, file):
546 # write data to file, as a C array
547 size = getsize(self.data)
548 # print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
549 file.write("static ")
550 if size == 1:
551 file.write("unsigned char")
552 elif size == 2:
553 file.write("unsigned short")
554 else:
555 file.write("unsigned int")
556 file.write(" " + self.name + "[] = {\n")
557 if self.data:
558 s = " "
559 for item in self.data:
560 i = str(item) + ", "
561 if len(s) + len(i) > 78:
562 file.write(s + "\n")
563 s = " " + i
564 else:
565 s = s + i
566 if string.strip(s):
567 file.write(s + "\n")
568 file.write("};\n\n")
569
570def getsize(data):
571 # return smallest possible integer size for the given array
572 maxdata = max(data)
573 if maxdata < 256:
574 return 1
575 elif maxdata < 65536:
576 return 2
577 else:
578 return 4
579
Tim Peters21013482000-09-25 07:13:41 +0000580def splitbins(t, trace=0):
581 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
582
583 t is a sequence of ints. This function can be useful to save space if
584 many of the ints are the same. t1 and t2 are lists of ints, and shift
585 is an int, chosen to minimize the combined size of t1 and t2 (in C
586 code), and where for each i in range(len(t)),
587 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
588 where mask is a bitmask isolating the last "shift" bits.
589
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000590 If optional arg trace is non-zero (default zero), progress info
591 is printed to sys.stderr. The higher the value, the more info
592 you'll get.
Tim Peters21013482000-09-25 07:13:41 +0000593 """
594
595 import sys
596 if trace:
597 def dump(t1, t2, shift, bytes):
598 print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
599 len(t1), len(t2), shift, bytes)
600 print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
601 "bytes"
602 n = len(t)-1 # last valid index
603 maxshift = 0 # the most we can shift n and still have something left
604 if n > 0:
605 while n >> 1:
606 n >>= 1
607 maxshift += 1
608 del n
609 bytes = sys.maxint # smallest total size so far
610 t = tuple(t) # so slices can be dict keys
611 for shift in range(maxshift + 1):
612 t1 = []
613 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000614 size = 2**shift
615 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +0000616 for i in range(0, len(t), size):
617 bin = t[i:i+size]
618 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000619 if index is None:
Tim Peters21013482000-09-25 07:13:41 +0000620 index = len(t2)
621 bincache[bin] = index
622 t2.extend(bin)
623 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000624 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +0000625 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000626 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +0000627 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000628 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +0000629 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000630 bytes = b
Tim Peters21013482000-09-25 07:13:41 +0000631 t1, t2, shift = best
632 if trace:
633 print >>sys.stderr, "Best:",
634 dump(t1, t2, shift, bytes)
635 if __debug__:
636 # exhaustively verify that the decomposition is correct
637 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
638 for i in xrange(len(t)):
639 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
640 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000641
642if __name__ == "__main__":
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000643 maketables(1)