blob: 6c29fd189a298a5000c9aaaace987793f9ff1457 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00004# this script converts a unicode 3.2 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Fredrik Lundhcfcea492000-09-25 08:07:06 +000022#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000023# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000024#
25
import sys

# SCRIPT/VERSION are embedded in the generated C headers' banner comment
SCRIPT = sys.argv[0]
VERSION = "2.3"

# The Unicode Database
UNIDATA_VERSION = "3.2.0"
# input files, as distributed with the Unicode Character Database
UNICODE_DATA = "UnicodeData.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
EASTASIAN_WIDTH = "EastAsianWidth.txt"

# general-category abbreviations; a record's category is stored as an
# index into this list (CATEGORY_NAMES.index(...)), and the list itself
# is emitted as _PyUnicode_CategoryNames.
# NOTE(review): "Cn" appears at index 0 and again at index 17; .index()
# always returns 0, so slot 17 is never produced -- confirm before changing.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# bidirectional-class abbreviations, indexed the same way and emitted
# as _PyUnicode_BidirectionalNames (index 0 = empty/unassigned)
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# flag bits OR-ed into each type record's "flags" field
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
WIDE_MASK = 0x100
Fredrik Lundhe9133f72000-09-25 17:59:57 +000056
def maketables(trace=0):
    # Top-level driver: read the Unicode database files once, then
    # regenerate the three C headers (name, data, and type tables).
    # trace is forwarded to the generators / splitbins() for progress
    # output on stderr.

    print "--- Reading", UNICODE_DATA, "..."

    unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS,
                          EASTASIAN_WIDTH)

    # count assigned code points (entries with a non-None record)
    print len(filter(None, unicode.table)), "characters"

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000069
70# --------------------------------------------------------------------
71# unicode character properties
72
def makeunicodedata(unicode, trace):
    # Generate Modules/unicodedata_db.h: deduplicated database records
    # (category, combining class, bidi class, mirrored flag), the
    # compressed decomposition data, and the NFC first/last tables used
    # for canonical composition.  All tables are two-level-indexed via
    # splitbins() to save space.

    dummy = (0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            item = (
                category, combining, bidirectional, mirrored
                )
            # add entry to index and item tables (deduplicate via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # prefix: a leading "<compat>"-style tag marks a
                # compatibility (non-canonical) decomposition
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs prefix index (low byte) and
                # decomposition length (high bits), then the code points
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs: canonical two-character
                # decompositions, not excluded, whose first character
                # has combining class 0
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints, so
                # .index(decomp) with a *list* argument never matches and
                # every decomposition is appended; the intended reuse of
                # identical entries appears ineffective -- confirm.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # renumber NFC first/last characters densely and record the
    # contiguous code-point ranges they occupy
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # dense (first x last) matrix mapping a pair to its composition
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    # compress the NFC pair matrix the same way
    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    fp.close()
274
275# --------------------------------------------------------------------
276# unicode character type tables
277
278def makeunicodetype(unicode, trace):
279
280 FILE = "Objects/unicodetype_db.h"
281
282 print "--- Preparing", FILE, "..."
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000283
284 # extract unicode types
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000285 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000286 table = [dummy]
287 cache = {0: dummy}
288 index = [0] * len(unicode.chars)
289
290 for char in unicode.chars:
291 record = unicode.table[char]
292 if record:
293 # extract database properties
294 category = record[2]
295 bidirectional = record[4]
296 flags = 0
297 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
298 flags |= ALPHA_MASK
299 if category == "Ll":
300 flags |= LOWER_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000301 if category == "Zl" or bidirectional == "B":
302 flags |= LINEBREAK_MASK
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000303 if category == "Zs" or bidirectional in ("WS", "B", "S"):
304 flags |= SPACE_MASK
Fredrik Lundh375732c2000-09-25 23:03:34 +0000305 if category == "Lt":
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000306 flags |= TITLE_MASK
307 if category == "Lu":
308 flags |= UPPER_MASK
309 # use delta predictor for upper/lower/title
310 if record[12]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000311 upper = int(record[12], 16) - char
312 assert -32768 <= upper <= 32767
313 upper = upper & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000314 else:
315 upper = 0
316 if record[13]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000317 lower = int(record[13], 16) - char
318 assert -32768 <= lower <= 32767
319 lower = lower & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000320 else:
321 lower = 0
322 if record[14]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000323 title = int(record[14], 16) - char
324 assert -32768 <= lower <= 32767
325 title = title & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000326 else:
327 title = 0
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000328 # decimal digit, integer digit
329 decimal = 0
330 if record[6]:
331 flags |= DECIMAL_MASK
332 decimal = int(record[6])
333 digit = 0
334 if record[7]:
335 flags |= DIGIT_MASK
336 digit = int(record[7])
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000337 if record[15] in ('W', 'F'): # Wide or Full width
338 flags |= WIDE_MASK
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000339 item = (
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000340 upper, lower, title, decimal, digit, flags
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000341 )
342 # add entry to index and item tables
343 i = cache.get(item)
344 if i is None:
345 cache[item] = i = len(table)
346 table.append(item)
347 index[char] = i
348
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000349 print len(table), "unique character type entries"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000350
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000351 print "--- Writing", FILE, "..."
352
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000353 fp = open(FILE, "w")
Fred Drake9c685052000-10-26 03:56:46 +0000354 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
355 print >>fp
356 print >>fp, "/* a list of unique character type descriptors */"
357 print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000358 for item in table:
Fred Drake9c685052000-10-26 03:56:46 +0000359 print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
360 print >>fp, "};"
361 print >>fp
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000362
363 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000364 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000365
Fred Drake9c685052000-10-26 03:56:46 +0000366 print >>fp, "/* type indexes */"
367 print >>fp, "#define SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000368 Array("index1", index1).dump(fp, trace)
369 Array("index2", index2).dump(fp, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000370
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000371 fp.close()
372
373# --------------------------------------------------------------------
374# unicode name database
375
def makeunicodename(unicode, trace):
    # Generate Modules/unicodename_db.h: a word-based "phrasebook"
    # encoding of character names (code->name), plus a static hash table
    # for name->code lookup.

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names (skip unnamed and "<...>" placeholder records);
    # a NUL byte terminates each stored name
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence.  the
    # latter includes the trailing null byte.

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            # each dict value is a list whose length is the word's
            # occurrence count (contents are dummies)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need: word indexes
    # >= "short" are stored as two bytes, introduced by an escape byte
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon: each name becomes a
    # sequence of word indexes (one byte if < short, else two bytes)
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry must fit in an unsigned char
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table into two levels
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
536
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000537# --------------------------------------------------------------------
538# the following support code is taken from the unidb utilities
539# Copyright (c) 1999-2000 by Secret Labs AB
540
541# load a unicode-data file from disk
542
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000543import sys
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000544
class UnicodeData:
    # Loads UnicodeData.txt into self.table (one semicolon-split record
    # per code point, None for unassigned), plus the composition
    # exclusions (self.exclusions) and the East Asian width, which is
    # appended to each record as field 15.

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges: "<..., First>"/"<..., Last>" record
        # pairs stand for a whole range; replicate the record (with the
        # code point fixed up) for every character in between
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s[:]
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    field[0] = hex(i)
                    table[i] = field

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # composition exclusions: a set (dict) of excluded code points
        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        # east asian width: entries are either a single code point or a
        # "first..last" range, followed by ";<width class>"
        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        # append the width (or None) as field 15 of every record
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
612
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000613# hash table tools
614
615# this is a straight-forward reimplementation of Python's built-in
616# dictionary type, using a static data structure, and a custom string
617# hash algorithm.
618
def myhash(s, magic):
    # Case-insensitive multiplicative string hash, folded to 24 bits.
    # Must stay in sync with the C implementation that probes the
    # generated table.  NOTE: relies on Python 2 long-integer semantics
    # (the 0x...L literal); do not port blindly.
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000L
        if ix:
            # fold overflowing high byte back into the low 24 bits
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h
627
# (table size, polynomial) pairs for the static hash table; sizes are
# powers of two, each paired with a value used to build the probe
# polynomial in Hash.__init__
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
634
class Hash:
    # Build a static open-addressing hash table from (key, value) pairs,
    # for emission as a C array.  Lookup code on the C side must use the
    # same myhash() and probe sequence.

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first power of two larger than the data
        # set, with its associated probe polynomial
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynominals"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: probe with a secondary increment derived from
            # the hash, stepping the increment through the polynomial
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        # replace empty slots with 0 so the C array is fully numeric
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, plus the #defines the C
        # lookup code needs (magic, size, poly)
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
698
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000699# stuff to deal with arrays of unsigned integers
700
class Array:
    # A named array of unsigned integers, dumpable as a static C array
    # using the smallest element type that fits the data.

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                # wrap lines at 78 columns
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")
732
def getsize(data):
    """Return the smallest C integer width in bytes (1, 2, or 4) that
    can represent every value in *data*, a non-empty sequence of ints."""
    largest = max(data)
    for width, limit in ((1, 256), (2, 65536)):
        if largest < limit:
            return width
    return 4
742
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    # try every bin size 2**shift and keep the cheapest decomposition
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                # first time this bin is seen: append it to t2
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000804
if __name__ == "__main__":
    # regenerate all tables, with progress tracing enabled
    maketables(1)