blob: 3b2fd1152d70e661427b599afc93dc5af470e365 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
4# this script converts a unicode 3.0 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Fredrik Lundhcfcea492000-09-25 08:07:06 +000016#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000017# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000018#
19
20import sys
21
22SCRIPT = sys.argv[0]
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000023VERSION = "2.1"
Fredrik Lundhf367cac2000-09-24 23:18:31 +000024
Fredrik Lundhe9133f72000-09-25 17:59:57 +000025UNICODE_DATA = "UnicodeData-Latest.txt"
Fredrik Lundhf367cac2000-09-24 23:18:31 +000026
27CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
28 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
29 "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
30 "So" ]
31
32BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
33 "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
34 "ON" ]
35
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000036# note: should match definitions in Objects/unicodectype.c
Fredrik Lundhe9133f72000-09-25 17:59:57 +000037ALPHA_MASK = 0x01
38DECIMAL_MASK = 0x02
39DIGIT_MASK = 0x04
40LOWER_MASK = 0x08
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000041LINEBREAK_MASK = 0x10
Fredrik Lundhe9133f72000-09-25 17:59:57 +000042SPACE_MASK = 0x20
43TITLE_MASK = 0x40
44UPPER_MASK = 0x80
45
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000046def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +000047
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000048 print "--- Reading", UNICODE_DATA, "..."
49
Fredrik Lundhf367cac2000-09-24 23:18:31 +000050 unicode = UnicodeData(UNICODE_DATA)
51
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000052 print len(filter(None, unicode.table)), "characters"
53
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000054 makeunicodename(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000055 makeunicodedata(unicode, trace)
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000056 makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000057
58# --------------------------------------------------------------------
59# unicode character properties
60
61def makeunicodedata(unicode, trace):
62
Fredrik Lundhcfcea492000-09-25 08:07:06 +000063 dummy = (0, 0, 0, 0)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000064 table = [dummy]
65 cache = {0: dummy}
66 index = [0] * len(unicode.chars)
67
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000068 FILE = "Modules/unicodedata_db.h"
69
70 print "--- Preparing", FILE, "..."
71
Fredrik Lundhcfcea492000-09-25 08:07:06 +000072 # 1) database properties
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000073
Fredrik Lundhf367cac2000-09-24 23:18:31 +000074 for char in unicode.chars:
75 record = unicode.table[char]
76 if record:
77 # extract database properties
78 category = CATEGORY_NAMES.index(record[2])
79 combining = int(record[3])
80 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
81 mirrored = record[9] == "Y"
Fredrik Lundhf367cac2000-09-24 23:18:31 +000082 item = (
Fredrik Lundhcfcea492000-09-25 08:07:06 +000083 category, combining, bidirectional, mirrored
Fredrik Lundhf367cac2000-09-24 23:18:31 +000084 )
85 # add entry to index and item tables
86 i = cache.get(item)
87 if i is None:
88 cache[item] = i = len(table)
89 table.append(item)
90 index[char] = i
91
Fredrik Lundhcfcea492000-09-25 08:07:06 +000092 # 2) decomposition data
93
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000094 decomp_data = [0]
95 decomp_prefix = [""]
Fredrik Lundhcfcea492000-09-25 08:07:06 +000096 decomp_index = [0] * len(unicode.chars)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000097 decomp_size = 0
Fredrik Lundhcfcea492000-09-25 08:07:06 +000098
99 for char in unicode.chars:
100 record = unicode.table[char]
101 if record:
102 if record[5]:
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000103 decomp = record[5].split()
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000104 # prefix
105 if decomp[0][0] == "<":
106 prefix = decomp.pop(0)
107 else:
108 prefix = ""
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000109 try:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000110 i = decomp_prefix.index(prefix)
111 except ValueError:
112 i = len(decomp_prefix)
113 decomp_prefix.append(prefix)
114 prefix = i
115 assert prefix < 256
116 # content
117 decomp = [prefix + (len(decomp)<<8)] +\
118 map(lambda s: int(s, 16), decomp)
119 try:
120 i = decomp_data.index(decomp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000121 except ValueError:
122 i = len(decomp_data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000123 decomp_data.extend(decomp)
124 decomp_size = decomp_size + len(decomp) * 2
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000125 else:
126 i = 0
127 decomp_index[char] = i
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000128
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000129 print len(table), "unique properties"
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000130 print len(decomp_prefix), "unique decomposition prefixes"
131 print len(decomp_data), "unique decomposition entries:",
132 print decomp_size, "bytes"
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000133
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000134 print "--- Writing", FILE, "..."
135
Fred Drake9c685052000-10-26 03:56:46 +0000136 fp = open(FILE, "w")
137 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
138 print >>fp
139 print >>fp, "/* a list of unique database records */"
140 print >>fp, \
141 "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000142 for item in table:
Fred Drake9c685052000-10-26 03:56:46 +0000143 print >>fp, " {%d, %d, %d, %d}," % item
144 print >>fp, "};"
145 print >>fp
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000146
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000147 # FIXME: <fl> the following tables could be made static, and
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000148 # the support code moved into unicodedatabase.c
149
Fred Drake9c685052000-10-26 03:56:46 +0000150 print >>fp, "/* string literals */"
151 print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000152 for name in CATEGORY_NAMES:
Fred Drake9c685052000-10-26 03:56:46 +0000153 print >>fp, " \"%s\"," % name
154 print >>fp, " NULL"
155 print >>fp, "};"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000156
Fred Drake9c685052000-10-26 03:56:46 +0000157 print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000158 for name in BIDIRECTIONAL_NAMES:
Fred Drake9c685052000-10-26 03:56:46 +0000159 print >>fp, " \"%s\"," % name
160 print >>fp, " NULL"
161 print >>fp, "};"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000162
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000163 print >>fp, "static const char *decomp_prefix[] = {"
164 for name in decomp_prefix:
Fred Drake9c685052000-10-26 03:56:46 +0000165 print >>fp, " \"%s\"," % name
166 print >>fp, " NULL"
167 print >>fp, "};"
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000168
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000169 # split record index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000170 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000171
Fred Drake9c685052000-10-26 03:56:46 +0000172 print >>fp, "/* index tables for the database records */"
173 print >>fp, "#define SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000174 Array("index1", index1).dump(fp, trace)
175 Array("index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000176
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000177 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000178 index1, index2, shift = splitbins(decomp_index, trace)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000179
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000180 print >>fp, "/* decomposition data */"
181 Array("decomp_data", decomp_data).dump(fp, trace)
182
Fred Drake9c685052000-10-26 03:56:46 +0000183 print >>fp, "/* index tables for the decomposition data */"
184 print >>fp, "#define DECOMP_SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000185 Array("decomp_index1", index1).dump(fp, trace)
186 Array("decomp_index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000187
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000188 fp.close()
189
190# --------------------------------------------------------------------
191# unicode character type tables
192
193def makeunicodetype(unicode, trace):
194
195 FILE = "Objects/unicodetype_db.h"
196
197 print "--- Preparing", FILE, "..."
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000198
199 # extract unicode types
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000200 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000201 table = [dummy]
202 cache = {0: dummy}
203 index = [0] * len(unicode.chars)
204
205 for char in unicode.chars:
206 record = unicode.table[char]
207 if record:
208 # extract database properties
209 category = record[2]
210 bidirectional = record[4]
211 flags = 0
212 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
213 flags |= ALPHA_MASK
214 if category == "Ll":
215 flags |= LOWER_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000216 if category == "Zl" or bidirectional == "B":
217 flags |= LINEBREAK_MASK
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000218 if category == "Zs" or bidirectional in ("WS", "B", "S"):
219 flags |= SPACE_MASK
Fredrik Lundh375732c2000-09-25 23:03:34 +0000220 if category == "Lt":
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000221 flags |= TITLE_MASK
222 if category == "Lu":
223 flags |= UPPER_MASK
224 # use delta predictor for upper/lower/title
225 if record[12]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000226 upper = int(record[12], 16) - char
227 assert -32768 <= upper <= 32767
228 upper = upper & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000229 else:
230 upper = 0
231 if record[13]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000232 lower = int(record[13], 16) - char
233 assert -32768 <= lower <= 32767
234 lower = lower & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000235 else:
236 lower = 0
237 if record[14]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000238 title = int(record[14], 16) - char
239 assert -32768 <= lower <= 32767
240 title = title & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000241 else:
242 title = 0
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000243 # decimal digit, integer digit
244 decimal = 0
245 if record[6]:
246 flags |= DECIMAL_MASK
247 decimal = int(record[6])
248 digit = 0
249 if record[7]:
250 flags |= DIGIT_MASK
251 digit = int(record[7])
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000252 item = (
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000253 flags, upper, lower, title, decimal, digit
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000254 )
255 # add entry to index and item tables
256 i = cache.get(item)
257 if i is None:
258 cache[item] = i = len(table)
259 table.append(item)
260 index[char] = i
261
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000262 print len(table), "unique character type entries"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000263
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000264 print "--- Writing", FILE, "..."
265
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000266 fp = open(FILE, "w")
Fred Drake9c685052000-10-26 03:56:46 +0000267 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
268 print >>fp
269 print >>fp, "/* a list of unique character type descriptors */"
270 print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000271 for item in table:
Fred Drake9c685052000-10-26 03:56:46 +0000272 print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
273 print >>fp, "};"
274 print >>fp
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000275
276 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000277 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000278
Fred Drake9c685052000-10-26 03:56:46 +0000279 print >>fp, "/* type indexes */"
280 print >>fp, "#define SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000281 Array("index1", index1).dump(fp, trace)
282 Array("index2", index2).dump(fp, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000283
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000284 fp.close()
285
286# --------------------------------------------------------------------
287# unicode name database
288
289def makeunicodename(unicode, trace):
290
291 FILE = "Modules/unicodename_db.h"
292
293 print "--- Preparing", FILE, "..."
294
295 # collect names
296 names = [None] * len(unicode.chars)
297
298 for char in unicode.chars:
299 record = unicode.table[char]
300 if record:
301 name = record[1].strip()
302 if name and name[0] != "<":
303 names[char] = name + chr(0)
304
305 print len(filter(lambda n: n is not None, names)), "distinct names"
306
307 # collect unique words from names (note that we differ between
308 # words inside a sentence, and words ending a sentence. the
309 # latter includes the trailing null byte.
310
311 words = {}
312 n = b = 0
313 for char in unicode.chars:
314 name = names[char]
315 if name:
316 w = name.split()
317 b = b + len(name)
318 n = n + len(w)
319 for w in w:
320 l = words.get(w)
321 if l:
322 l.append(None)
323 else:
324 words[w] = [len(words)]
325
326 print n, "words in text;", b, "bytes"
327
328 wordlist = words.items()
329
330 # sort on falling frequency
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000331 # XXX: different Python versions produce a different order
332 # for words with equal frequency
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000333 wordlist.sort(lambda a, b: len(b[1])-len(a[1]))
334
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000335 # figure out how many phrasebook escapes we need
336 escapes = 0
337 while escapes * 256 < len(wordlist):
338 escapes = escapes + 1
339 print escapes, "escapes"
340
341 short = 256 - escapes
342
343 assert short > 0
344
345 print short, "short indexes in lexicon"
346
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000347 # statistics
348 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000349 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000350 n = n + len(wordlist[i][1])
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000351 print n, "short indexes in phrasebook"
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000352
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000353 # pick the most commonly used words, and sort the rest on falling
354 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000355
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000356 wordlist, wordtail = wordlist[:short], wordlist[short:]
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000357 wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
358 wordlist.extend(wordtail)
359
360 # generate lexicon from words
361
362 lexicon_offset = [0]
363 lexicon = ""
364 words = {}
365
366 # build a lexicon string
367 offset = 0
368 for w, x in wordlist:
369 # encoding: bit 7 indicates last character in word (chr(128)
370 # indicates the last character in an entire string)
371 ww = w[:-1] + chr(ord(w[-1])+128)
372 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000373 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000374 if o < 0:
375 o = offset
376 lexicon = lexicon + ww
377 offset = offset + len(w)
378 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000379 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000380
381 lexicon = map(ord, lexicon)
382
383 # generate phrasebook from names and lexicon
384 phrasebook = [0]
385 phrasebook_offset = [0] * len(unicode.chars)
386 for char in unicode.chars:
387 name = names[char]
388 if name:
389 w = name.split()
390 phrasebook_offset[char] = len(phrasebook)
391 for w in w:
392 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000393 if i < short:
394 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000395 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000396 # store as two bytes
397 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000398 phrasebook.append(i&255)
399
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000400 assert getsize(phrasebook) == 1
401
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000402 #
403 # unicode name hash table
404
405 # extract names
406 data = []
407 for char in unicode.chars:
408 record = unicode.table[char]
409 if record:
410 name = record[1].strip()
411 if name and name[0] != "<":
412 data.append((name, char))
413
414 # the magic number 47 was chosen to minimize the number of
415 # collisions on the current data set. if you like, change it
416 # and see what happens...
417
418 codehash = Hash("code", data, 47)
419
420 print "--- Writing", FILE, "..."
421
422 fp = open(FILE, "w")
423 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
424 print >>fp
425 print >>fp, "#define NAME_MAXLEN", 256
426 print >>fp
427 print >>fp, "/* lexicon */"
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000428 Array("lexicon", lexicon).dump(fp, trace)
429 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000430
431 # split decomposition index table
432 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
433
434 print >>fp, "/* code->name phrasebook */"
435 print >>fp, "#define phrasebook_shift", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000436 print >>fp, "#define phrasebook_short", short
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000437
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000438 Array("phrasebook", phrasebook).dump(fp, trace)
439 Array("phrasebook_offset1", offset1).dump(fp, trace)
440 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000441
442 print >>fp, "/* name->code dictionary */"
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000443 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000444
445 fp.close()
446
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000447# --------------------------------------------------------------------
448# the following support code is taken from the unidb utilities
449# Copyright (c) 1999-2000 by Secret Labs AB
450
451# load a unicode-data file from disk
452
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000453import sys
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000454
455class UnicodeData:
456
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000457 def __init__(self, filename, expand=1):
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000458 file = open(filename)
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000459 table = [None] * 0x110000
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000460 while 1:
461 s = file.readline()
462 if not s:
463 break
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000464 s = s.strip().split(";")
465 char = int(s[0], 16)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000466 table[char] = s
467
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000468 # expand first-last ranges (ignore surrogates and private use)
469 if expand:
470 field = None
471 for i in range(0, 0xD800):
472 s = table[i]
473 if s:
474 if s[1][-6:] == "First>":
475 s[1] = ""
476 field = s[:]
477 elif s[1][-5:] == "Last>":
478 s[1] = ""
479 field = None
480 elif field:
481 field[0] = hex(i)
482 table[i] = field
483
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000484 # public attributes
485 self.filename = filename
486 self.table = table
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000487 self.chars = range(0x110000) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000488
489 def uselatin1(self):
490 # restrict character range to ISO Latin 1
491 self.chars = range(256)
492
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000493# hash table tools
494
495# this is a straight-forward reimplementation of Python's built-in
496# dictionary type, using a static data structure, and a custom string
497# hash algorithm.
498
499def myhash(s, magic):
500 h = 0
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000501 for c in map(ord, s.upper()):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000502 h = (h * magic) + c
503 ix = h & 0xff000000
504 if ix:
505 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
506 return h
507
508SIZES = [
509 (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
510 (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
511 (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
512 (2097152,5), (4194304,3), (8388608,33), (16777216,27)
513]
514
515class Hash:
516 def __init__(self, name, data, magic):
517 # turn a (key, value) list into a static hash table structure
518
519 # determine table size
520 for size, poly in SIZES:
521 if size > len(data):
522 poly = size + poly
523 break
524 else:
525 raise AssertionError, "ran out of polynominals"
526
527 print size, "slots in hash table"
528
529 table = [None] * size
530
531 mask = size-1
532
533 n = 0
534
535 hash = myhash
536
537 # initialize hash table
538 for key, value in data:
539 h = hash(key, magic)
540 i = (~h) & mask
541 v = table[i]
542 if v is None:
543 table[i] = value
544 continue
545 incr = (h ^ (h >> 3)) & mask;
546 if not incr:
547 incr = mask
548 while 1:
549 n = n + 1
550 i = (i + incr) & mask
551 v = table[i]
552 if v is None:
553 table[i] = value
554 break
555 incr = incr << 1
556 if incr > mask:
557 incr = incr ^ poly
558
559 print n, "collisions"
560 self.collisions = n
561
562 for i in range(len(table)):
563 if table[i] is None:
564 table[i] = 0
565
566 self.data = Array(name + "_hash", table)
567 self.magic = magic
568 self.name = name
569 self.size = size
570 self.poly = poly
571
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000572 def dump(self, file, trace):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000573 # write data to file, as a C array
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000574 self.data.dump(file, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000575 file.write("#define %s_magic %d\n" % (self.name, self.magic))
576 file.write("#define %s_size %d\n" % (self.name, self.size))
577 file.write("#define %s_poly %d\n" % (self.name, self.poly))
578
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000579# stuff to deal with arrays of unsigned integers
580
581class Array:
582
583 def __init__(self, name, data):
584 self.name = name
585 self.data = data
586
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000587 def dump(self, file, trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000588 # write data to file, as a C array
589 size = getsize(self.data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000590 if trace:
591 print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000592 file.write("static ")
593 if size == 1:
594 file.write("unsigned char")
595 elif size == 2:
596 file.write("unsigned short")
597 else:
598 file.write("unsigned int")
599 file.write(" " + self.name + "[] = {\n")
600 if self.data:
601 s = " "
602 for item in self.data:
603 i = str(item) + ", "
604 if len(s) + len(i) > 78:
605 file.write(s + "\n")
606 s = " " + i
607 else:
608 s = s + i
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000609 if s.strip():
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000610 file.write(s + "\n")
611 file.write("};\n\n")
612
613def getsize(data):
614 # return smallest possible integer size for the given array
615 maxdata = max(data)
616 if maxdata < 256:
617 return 1
618 elif maxdata < 65536:
619 return 2
620 else:
621 return 4
622
Tim Peters21013482000-09-25 07:13:41 +0000623def splitbins(t, trace=0):
624 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
625
626 t is a sequence of ints. This function can be useful to save space if
627 many of the ints are the same. t1 and t2 are lists of ints, and shift
628 is an int, chosen to minimize the combined size of t1 and t2 (in C
629 code), and where for each i in range(len(t)),
630 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
631 where mask is a bitmask isolating the last "shift" bits.
632
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000633 If optional arg trace is non-zero (default zero), progress info
634 is printed to sys.stderr. The higher the value, the more info
635 you'll get.
Tim Peters21013482000-09-25 07:13:41 +0000636 """
637
638 import sys
639 if trace:
640 def dump(t1, t2, shift, bytes):
641 print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
642 len(t1), len(t2), shift, bytes)
643 print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
644 "bytes"
645 n = len(t)-1 # last valid index
646 maxshift = 0 # the most we can shift n and still have something left
647 if n > 0:
648 while n >> 1:
649 n >>= 1
650 maxshift += 1
651 del n
652 bytes = sys.maxint # smallest total size so far
653 t = tuple(t) # so slices can be dict keys
654 for shift in range(maxshift + 1):
655 t1 = []
656 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000657 size = 2**shift
658 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +0000659 for i in range(0, len(t), size):
660 bin = t[i:i+size]
661 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000662 if index is None:
Tim Peters21013482000-09-25 07:13:41 +0000663 index = len(t2)
664 bincache[bin] = index
665 t2.extend(bin)
666 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000667 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +0000668 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000669 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +0000670 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000671 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +0000672 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000673 bytes = b
Tim Peters21013482000-09-25 07:13:41 +0000674 t1, t2, shift = best
675 if trace:
676 print >>sys.stderr, "Best:",
677 dump(t1, t2, shift, bytes)
678 if __debug__:
679 # exhaustively verify that the decomposition is correct
680 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
681 for i in xrange(len(t)):
682 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
683 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000684
685if __name__ == "__main__":
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000686 maketables(1)