#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys

SCRIPT = sys.argv[0]
VERSION = "2.2"

# The Unicode Database
UNIDATA_VERSION = "3.2.0"
UNICODE_DATA = "UnicodeData.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80

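# Illustration: these flags are OR-ed together per character in
# makeunicodetype() below; an upper-case letter such as "A" ends up with
#
#     flags = ALPHA_MASK | UPPER_MASK        # 0x01 | 0x80 == 0x81
#
# and the C macros in Objects/unicodectype.c are expected to test single
# properties with expressions of the form (flags & UPPER_MASK) != 0.
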
def maketables(trace=0):

    print "--- Reading", UNICODE_DATA, "..."

    unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS)

    print len(filter(None, unicode.table)), "characters"

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

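    # For orientation: a UnicodeData.txt record is one long semicolon-
    # separated line, roughly
    #
    #   00C5;LATIN CAPITAL LETTER A WITH RING ABOVE;Lu;0;L;0041 030A;;;;N;LATIN CAPITAL LETTER A RING;;;00E5;
    #
    # record[2] is the general category ("Lu"), record[3] the canonical
    # combining class ("0"), record[4] the bidi category ("L"), and
    # record[9] the mirrored flag ("N"); those four fields are all that
    # this first pass keeps.
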
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            item = (
                category, combining, bidirectional, mirrored
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

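    # Rough sketch (an assumption about the consumer, not used here): the
    # runtime composition lookup is expected to invert the layout built
    # above, along the lines of
    #
    #     def compose(first, last):
    #         f = comp_first[first]
    #         s = comp_last[last]
    #         if f is None or s is None:
    #             return 0                     # 0 means "no composition"
    #         return comp_data[f*total_last+s]
    #
    # with the nfc_first/nfc_last reindex tables written below standing in
    # for the comp_first/comp_last lists on the C side.
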
    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
            if record[12]:
                upper = int(record[12], 16) - char
                assert -32768 <= upper <= 32767
                upper = upper & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = int(record[13], 16) - char
                assert -32768 <= lower <= 32767
                lower = lower & 0xffff
            else:
                lower = 0
            if record[14]:
                title = int(record[14], 16) - char
                assert -32768 <= title <= 32767
                title = title & 0xffff
            else:
                title = 0
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                flags, upper, lower, title, decimal, digit
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

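    # Worked example of the delta encoding above: for "a" (0x61) the
    # upper-case mapping is "A" (0x41), so the stored delta is
    # 0x41 - 0x61 = -0x20, masked to 0xFFE0; a consumer recovers the
    # mapping with 16-bit wraparound, (0x61 + 0xFFE0) & 0xFFFF == 0x41.
    # The macros in Objects/unicodectype.c are assumed to do the
    # equivalent.
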
    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split char type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

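    # Rough decoding sketch (an assumption about the consumer, not used
    # here): to turn a code point back into its name, a reader starts at
    # phrasebook_offset[char] and walks the phrasebook:
    #
    #     pos = phrasebook_offset[char]
    #     while 1:
    #         i = phrasebook[pos]
    #         pos = pos + 1
    #         if i >= short:                   # two-byte escape
    #             i = (i - short) * 256 + phrasebook[pos]
    #             pos = pos + 1
    #         # copy lexicon bytes starting at lexicon_offset[i], up to
    #         # and including the first byte with bit 7 set; a bare
    #         # chr(128) (the encoded trailing chr(0)) ends the name
    #
    # The C code that includes unicodename_db.h is assumed to implement
    # this walk.
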
    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

import sys

class UnicodeData:

    def __init__(self, filename, exclusions, expand=1):
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
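        # (for illustration, such a range appears in UnicodeData.txt
        #  roughly as
        #      4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
        #      9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
        #  and every unlisted code point in between is given the range's
        #  record, with the name field blanked)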
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s[:]
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    field[0] = hex(i)
                    table[i] = field

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000L
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

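# A lookup against this table (done by the code that includes the generated
# header) is assumed to retrace the probe sequence used during construction,
# roughly:
#
#     def lookup(key, magic, table, mask, poly):
#         h = myhash(key, magic)
#         i = (~h) & mask
#         incr = (h ^ (h >> 3)) & mask
#         if not incr:
#             incr = mask
#         while table[i] and not matches(table[i], key):
#             i = (i + incr) & mask
#             incr = incr << 1
#             if incr > mask:
#                 incr = incr ^ poly
#         return table[i]                      # 0 means "not found"
#
# where matches() is a hypothetical name comparison (the stored value is a
# code point whose name has to be reconstructed via the phrasebook).
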
# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

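# For example, Array("index1", [1, 2, 3]).dump(fp) writes roughly
#
#     static unsigned char index1[] = {
#         1, 2, 3,
#     };
#
# to fp, widening the element type to unsigned short or unsigned int when
# getsize() below says the values need it.
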
def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift). Split a table to save space.

    t is a sequence of ints. This function can be useful to save space if
    many of the ints are the same. t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr. The higher the value, the more info
    you'll get.
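
    A small usage sketch (the particular t1, t2 and shift chosen depend
    on the input data):

    >>> t1, t2, shift = splitbins([1, 1, 1, 2, 2, 2, 3, 3])
    >>> mask = (1 << shift) - 1
    >>> [t2[(t1[i >> shift] << shift) + (i & mask)] for i in range(8)]
    [1, 1, 1, 2, 2, 2, 3, 3]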
    """

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)