blob: 7186780f81e157ab62951e0ae007bc4e92cd0d05 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl    created (based on bits and pieces from unidb)
# 2000-09-25 fl    merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl    added character type table
# 2000-09-26 fl    added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl    expand first/last ranges
# 2001-01-19 fl    added character name tables (2.1)
# 2001-01-21 fl    added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd    use string methods
# 2002-10-18 mvl   update to Unicode 3.2
# 2002-10-22 mvl   generate NFC tables
# 2002-11-24 mvl   expand all ranges, sort names version-independently
# 2002-11-25 mvl   add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
25
26import sys
27
SCRIPT = sys.argv[0]
VERSION = "2.3"

# version of the Unicode Character Database this script expects to read
UNIDATA_VERSION = "3.2.0"
UNICODE_DATA = "UnicodeData.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
EASTASIAN_WIDTH = "EastAsianWidth.txt"

# General-category abbreviations; a category's index in this list is the
# numeric value stored in the generated database records.
# NOTE(review): "Cn" appears at index 0 *and* index 17 -- presumably to
# match the numbering expected by the C side; confirm before changing.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class abbreviations, encoded the same way ("" = none)
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East Asian width classes, encoded the same way
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# character-type flag bits for the unicodetype records
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
57
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000058def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +000059
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000060 print "--- Reading", UNICODE_DATA, "..."
61
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000062 unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS,
63 EASTASIAN_WIDTH)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000064
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000065 print len(filter(None, unicode.table)), "characters"
66
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000067 makeunicodename(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000068 makeunicodedata(unicode, trace)
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000069 makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000070
71# --------------------------------------------------------------------
72# unicode character properties
73
def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h for the unicodedata C module:
    unique property records, compressed decomposition data, and the
    NFC composition-pair tables.

    unicode is a UnicodeData instance; trace is forwarded to splitbins
    and Array.dump.  Writes the header file as a side effect and prints
    progress/statistics to stdout.
    """

    # record layout:
    # (category, combining, bidirectional, mirrored, east-asian-width);
    # entry 0 is the shared record for unassigned code points
    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables; identical records are
            # shared through the cache
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # prefix: the "<compat>"-style tag, if any
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs the prefix index (low byte)
                # and the decomposition length (high bits), followed by
                # the decomposed code points
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions whose first character has combining
                # class 0, minus the composition exclusions
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data holds ints while decomp is a
                # list, so this lookup presumably always raises ValueError
                # and identical decompositions are never shared -- only
                # wastes table space; verify before "fixing".
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # assign dense indexes to the NFC first/last characters, remembering
    # the contiguous code point ranges they occupy (for the C reindex
    # tables below)
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # flat total_first x total_last matrix of composed characters
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    fp.close()
282
283# --------------------------------------------------------------------
284# unicode character type tables
285
286def makeunicodetype(unicode, trace):
287
288 FILE = "Objects/unicodetype_db.h"
289
290 print "--- Preparing", FILE, "..."
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000291
292 # extract unicode types
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000293 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000294 table = [dummy]
295 cache = {0: dummy}
296 index = [0] * len(unicode.chars)
297
298 for char in unicode.chars:
299 record = unicode.table[char]
300 if record:
301 # extract database properties
302 category = record[2]
303 bidirectional = record[4]
304 flags = 0
305 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
306 flags |= ALPHA_MASK
307 if category == "Ll":
308 flags |= LOWER_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000309 if category == "Zl" or bidirectional == "B":
310 flags |= LINEBREAK_MASK
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000311 if category == "Zs" or bidirectional in ("WS", "B", "S"):
312 flags |= SPACE_MASK
Fredrik Lundh375732c2000-09-25 23:03:34 +0000313 if category == "Lt":
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000314 flags |= TITLE_MASK
315 if category == "Lu":
316 flags |= UPPER_MASK
317 # use delta predictor for upper/lower/title
318 if record[12]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000319 upper = int(record[12], 16) - char
320 assert -32768 <= upper <= 32767
321 upper = upper & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000322 else:
323 upper = 0
324 if record[13]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000325 lower = int(record[13], 16) - char
326 assert -32768 <= lower <= 32767
327 lower = lower & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000328 else:
329 lower = 0
330 if record[14]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000331 title = int(record[14], 16) - char
332 assert -32768 <= lower <= 32767
333 title = title & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000334 else:
335 title = 0
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000336 # decimal digit, integer digit
337 decimal = 0
338 if record[6]:
339 flags |= DECIMAL_MASK
340 decimal = int(record[6])
341 digit = 0
342 if record[7]:
343 flags |= DIGIT_MASK
344 digit = int(record[7])
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000345 item = (
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000346 upper, lower, title, decimal, digit, flags
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000347 )
348 # add entry to index and item tables
349 i = cache.get(item)
350 if i is None:
351 cache[item] = i = len(table)
352 table.append(item)
353 index[char] = i
354
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000355 print len(table), "unique character type entries"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000356
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000357 print "--- Writing", FILE, "..."
358
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000359 fp = open(FILE, "w")
Fred Drake9c685052000-10-26 03:56:46 +0000360 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
361 print >>fp
362 print >>fp, "/* a list of unique character type descriptors */"
363 print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000364 for item in table:
Fred Drake9c685052000-10-26 03:56:46 +0000365 print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
366 print >>fp, "};"
367 print >>fp
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000368
369 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000370 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000371
Fred Drake9c685052000-10-26 03:56:46 +0000372 print >>fp, "/* type indexes */"
373 print >>fp, "#define SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000374 Array("index1", index1).dump(fp, trace)
375 Array("index2", index2).dump(fp, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000376
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000377 fp.close()
378
379# --------------------------------------------------------------------
380# unicode name database
381
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h: the compressed code->name
    phrasebook/lexicon and the static name->code hash table.

    unicode is a UnicodeData instance; trace is forwarded to splitbins,
    Array.dump and Hash.dump.  Writes the header file as a side effect
    and prints statistics to stdout.
    """

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names; "<...>" placeholder names are skipped, and a NUL
    # byte marks the end of each stored name
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names.  we distinguish words inside a
    # name from words ending a name: the latter keep the trailing NUL
    # byte, so they get separate lexicon entries.

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                # one list element per occurrence: the list length is
                # the word's frequency (the stored value is unused)
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon: word indexes below
    # "short" take one byte, the rest use a two-byte escape sequence
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
542
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000543# --------------------------------------------------------------------
544# the following support code is taken from the unidb utilities
545# Copyright (c) 1999-2000 by Secret Labs AB
546
547# load a unicode-data file from disk
548
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000549import sys
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000550
class UnicodeData:
    """In-memory view of the Unicode Character Database text files.

    Public attributes:
        filename   -- path of the UnicodeData.txt file that was read
        table      -- list indexed by code point; each entry is the
                      list of semicolon-separated UnicodeData.txt
                      fields with the East Asian width appended as
                      field 15, or None for unassigned code points
        chars      -- the code point range covered (0..0x10FFFF)
        exclusions -- dict keyed by the code points listed in
                      CompositionExclusions.txt
    """

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s
        file.close()

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s[:]
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    # BUGFIX: give every code point in the range its own
                    # copy of the record.  The old code stored the *same*
                    # list object for the whole range, so the East Asian
                    # width append below mutated one shared list (field
                    # 15 wrong for every member but the first) and the
                    # code point field kept being overwritten.
                    f2 = field[:]
                    f2[0] = hex(i)
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # composition exclusions: one code point per non-comment line
        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1
        file.close()

        # east asian width: "XXXX;W" or "XXXX..YYYY;W" per non-comment
        # line; append the width class as field 15 of every record
        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
618
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000619# hash table tools
620
621# this is a straight-forward reimplementation of Python's built-in
622# dictionary type, using a static data structure, and a custom string
623# hash algorithm.
624
def myhash(s, magic):
    # Case-insensitive multiplicative string hash: fold each character
    # of the upper-cased name into the accumulator, then mix any
    # overflow from the top byte back into the low 24 bits so the
    # result stays bounded.
    h = 0
    for c in s.upper():
        h = h * magic + ord(c)
        top = h & 0xff000000
        if top:
            h = (h ^ ((top >> 24) & 0xff)) & 0x00ffffff
    return h
633
# Candidate hash-table sizes (powers of two), each paired with a small
# constant; Hash.__init__ picks the first size strictly larger than the
# data set and uses size + constant as the "poly" value that perturbs
# the collision probe sequence.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
640
class Hash:
    """Static open-addressing hash table, emitted as a C array.

    Built from a list of (key, value) pairs; keys are strings hashed
    with myhash(key, magic).  Collisions are resolved by double
    hashing: the probe increment is derived from the hash value and,
    when it overflows the table mask, stepped with the table's poly
    constant.  The C lookup code must mirror this exact probe sequence.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: the smallest power of two in SIZES that
        # is strictly larger than the data set
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynominals"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0   # collision counter

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # secondary probe increment, guaranteed non-zero
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                # vary the increment; fold overflow back in with poly
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        # empty slots are emitted as 0 in the C table
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write the table plus the constants the C lookup code needs
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
704
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000705# stuff to deal with arrays of unsigned integers
706
class Array:
    """A named sequence of unsigned integers, dumpable as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array; the element type is the
        # smallest unsigned type that fits the largest value, and lines
        # are wrapped before column 78
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")
738
def getsize(data):
    """Return the smallest unsigned C integer width, in bytes (1, 2 or
    4), able to hold every value of *data*."""
    biggest = max(data)
    if biggest < 256:
        return 1
    if biggest < 65536:
        return 2
    return 4
748
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    # try every chunk size 2**shift and keep the cheapest decomposition
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}   # chunk -> index of its first copy in t2
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)   # t1 stores chunk numbers
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000810
if __name__ == "__main__":
    # regenerate all tables; trace level 1 prints splitbins statistics
    maketables(1)