#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.0 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl expand first/last ranges
# 2001-01-19 fl added character name tables (2.1)
# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
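#
# usage (assumed invocation; the output paths above are relative to the
# working directory, so run from the top of the Python source tree with
# UnicodeData-Latest.txt available):
#
#     python makeunicodedata.py
#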

import sys

SCRIPT = sys.argv[0]
VERSION = "2.1"

UNICODE_DATA = "UnicodeData-Latest.txt"

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
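
# the masks above are or'ed together into the flags field of each
# character type record (see makeunicodetype below); e.g. an uppercase
# letter is flagged ALPHA_MASK | UPPER_MASK == 0x81, and a lowercase
# letter ALPHA_MASK | LOWER_MASK == 0x09 (values derived from the
# definitions above).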

def maketables(trace=0):

    print "--- Reading", UNICODE_DATA, "..."

    unicode = UnicodeData(UNICODE_DATA)

    print len(filter(None, unicode.table)), "characters"

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            item = (
                category, combining, bidirectional, mirrored
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = string.split(record[5])
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i
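
    # layout note: decomp_data is one flat array; each decomposition
    # starts with a header word whose low byte indexes decomp_prefix and
    # whose high byte holds the number of code points that follow (e.g.
    # 0 + (2<<8) == 0x200 for a two-code-point canonical decomposition
    # with the empty prefix).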

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)
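
    # index1/index2 follow the splitbins() convention: the record for
    # character c is meant to be looked up (on the C side, outside this
    # script) as
    #
    #     index2[(index1[c >> SHIFT] << SHIFT) + (c & ((1 << SHIFT) - 1))]
    #
    # which gives an offset into _PyUnicode_Database_Records.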

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
            if record[12]:
                upper = (int(record[12], 16) - char) & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = (int(record[13], 16) - char) & 0xffff
            else:
                lower = 0
            if record[14]:
                title = (int(record[14], 16) - char) & 0xffff
            else:
                title = 0
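            # (e.g. a lowercase letter at 0x61 whose uppercase mapping is
            # 0x41 stores (0x41 - 0x61) & 0xffff == 0xffe0; adding the
            # delta back modulo 0x10000 recovers 0x41.  the decoding is
            # assumed to happen in the consuming C code, not here.)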
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                flags, upper, lower, title, decimal, digit
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split the character type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we differentiate
    # between words inside a name and words ending a name; the latter
    # include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]
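
    # words now maps each word to a list whose length is its frequency
    # (the initial element is just a placeholder id); these counts drive
    # the frequency sort below.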

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency
    wordlist.sort(lambda a, b: len(b[1])-len(a[1]))

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = string.find(lexicon, ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)
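
    # (e.g. the word "SIGN" inside a name is stored as "SIG" plus
    # chr(ord("N") + 128); a word that ends a name still carries the
    # trailing NUL added above, so its last stored byte is chr(128).)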

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk
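#
# each table entry is the raw semicolon-separated record for one code
# point; the fields this script uses are (by index): 1 name, 2 general
# category, 3 combining class, 4 bidirectional category, 5 decomposition,
# 6 decimal digit value, 7 digit value, 9 mirrored flag, and 12-14 the
# uppercase/lowercase/titlecase mappings.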

import string, sys

class UnicodeData:

    def __init__(self, filename, expand=1):
        file = open(filename)
        table = [None] * 65536
        while 1:
            s = file.readline()
            if not s:
                break
            s = string.split(string.strip(s), ";")
            char = string.atoi(s[0], 16)
            table[char] = s

        # expand first-last ranges (ignore surrogates and private use)
        if expand:
            field = None
            for i in range(0, 0xD800):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s[:]
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    field[0] = hex(i)
                    table[i] = field

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, string.upper(s)):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
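
# each SIZES entry pairs a power-of-two table size with a polynomial;
# Hash below picks the first size larger than the data set and uses the
# polynomial to fold the probe increment back into range when it grows
# past the table mask (see the collision loop in Hash.__init__).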

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if string.strip(s):
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
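
# typical use, as in the make* functions above:
#
#     index1, index2, shift = splitbins(index, trace)
#
# the two halves are then dumped as C arrays and recombined at lookup
# time using the formula from the docstring.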

if __name__ == "__main__":
    maketables(1)