#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
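# usage sketch (an assumption inferred from the relative paths used
# below, not part of the original notes): run this script from the top
# of a Python source tree, with UnicodeData.txt and
# CompositionExclusions.txt in the current directory, e.g.
#
#   python makeunicodedata.py
#
# it then rewrites the three generated headers listed above in place.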

import sys

SCRIPT = sys.argv[0]
VERSION = "2.1"

UNICODE_DATA = "UnicodeData.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
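# illustrative examples (not from the original comments): an uppercase
# letter such as U+0041 ends up with ALPHA_MASK | UPPER_MASK (0x81),
# and a decimal digit such as U+0030 with DECIMAL_MASK | DIGIT_MASK
# (0x06); see the flag assignments in makeunicodetype below.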

def maketables(trace=0):

    print "--- Reading", UNICODE_DATA, "..."

    unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS)

    print len(filter(None, unicode.table)), "characters"

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            item = (
                category, combining, bidirectional, mirrored
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

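    # worked example (illustrative, not in the original comments): U+00C5
    # has the canonical decomposition "0041 030A"; with no <prefix> tag it
    # is stored as [0 + (2<<8), 0x0041, 0x030A], and since U+0041 is a
    # starter (combining class 0) and U+00C5 is not excluded, the pair
    # (0x0041, 0x030A) -> 0x00C5 is also recorded for NFC composition.

    # the loop below renumbers the NFC first/last characters densely and
    # collects the contiguous code point ranges they occupy (written out
    # later as the nfc_first/nfc_last reindex tables)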
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

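    # comp_data is a total_first x total_last matrix, flattened row-major;
    # a pair is meant to be looked up as
    # comp_data[comp_first[first]*total_last + comp_last[last]]
    # (the table is then compressed through splitbins below)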
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
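            # (illustrative, not an original comment: for U+0061 'a' the
            # uppercase mapping is U+0041, so the stored delta is -0x20,
            # kept as the 16-bit two's complement value 0xffe0)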
            if record[12]:
                upper = int(record[12], 16) - char
                assert -32768 <= upper <= 32767
                upper = upper & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = int(record[13], 16) - char
                assert -32768 <= lower <= 32767
                lower = lower & 0xffff
            else:
                lower = 0
            if record[14]:
                title = int(record[14], 16) - char
                assert -32768 <= title <= 32767
                title = title & 0xffff
            else:
                title = 0
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                flags, upper, lower, title, decimal, digit
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split the type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

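    # words maps each word to a list whose length is the word's frequency
    # in the name list (the first element is only the discovery order);
    # just the length matters for the frequency sort further down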
    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency
    # XXX: different Python versions produce a different order
    # for words with equal frequency
    wordlist.sort(lambda a, b: len(b[1])-len(a[1]))

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

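    # every phrasebook entry has to fit in one byte: indexes below 'short'
    # are stored directly, larger ones as an escape byte (>= short)
    # followed by the low byte; the assert below checks the table is 8-bit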
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

import sys

class UnicodeData:

    def __init__(self, filename, exclusions, expand=1):
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges (ignore surrogates and private use)
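        # (e.g. the CJK ideograph and Hangul syllable blocks appear in
        # UnicodeData.txt only as <..., First>/<..., Last> pairs; the
        # loop below copies the First record onto every code point in
        # between, with the name field blanked)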
        if expand:
            field = None
            for i in range(0, 0xD800):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s[:]
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    field[0] = hex(i)
                    table[i] = field

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
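
# each SIZES entry is a (table size, polynomial) pair: sizes are powers
# of two, and the polynomial is used in Hash.__init__ below to fold an
# overflowing probe increment back into range (an LFSR-style step that
# keeps the increment inside the table and nonzero)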

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

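# illustrative sketch of the splitbins round trip (shown only as a
# comment, not part of the build):
#
#     t = [1, 1, 1, 1, 9, 9, 9, 9, 1, 1, 1, 1]
#     t1, t2, shift = splitbins(t)
#     mask = (1 << shift) - 1
#     for i in range(len(t)):
#         assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
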
if __name__ == "__main__":
    maketables(1)