blob: 2a85e8405c5bd046ae0a7b1f0af827908bec0e22 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
26
import sys

SCRIPT = sys.argv[0]   # generator name, stamped into the generated C headers
VERSION = "2.5"        # version of this generator script

# The Unicode Database
UNIDATA_VERSION = "4.1.0"
# Input file name templates: %s receives "" for the current UCD, or
# "-<version>" for an old UCD snapshot (see old_versions below).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"

# Old UCD versions for which delta (change) tables are generated.
old_versions = ["3.2.0"]

# General category codes; the list position is the numeric code emitted
# into the C tables.
# NOTE(review): "Cn" appears twice (index 0 and index 17); list.index()
# always returns the first occurrence, so code 17 is never produced.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional class codes; position == numeric code in the C tables.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East Asian width categories; position == numeric code in the C tables.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Character type flag bits.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
60
def maketables(trace=0):
    """Read the current UCD files plus every version in old_versions,
    merge the old-version deltas into the current database, and write
    the three generated C headers (name, data, and type tables)."""

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version)

    # filter(None, ...) drops unassigned (None) slots
    print len(filter(None, unicode.table)), "characters"

    # load each old UCD snapshot and record its differences on `unicode`
    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000083
84# --------------------------------------------------------------------
85# unicode character properties
86
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h: per-character property records,
    decomposition data, NFC composition tables, and delta tables for
    each old UCD version recorded in unicode.changed."""

    # Each record: (category, combining, bidirectional, mirrored,
    # eastasianwidth).  Record 0 is the dummy for unassigned code points.
    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)   # char -> record number

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables (records are shared)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data
    #
    # decomp_data is a flat int array: for each decomposition, one header
    # word (prefix-code | length<<8) followed by the decomposed code points.

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)  # char -> offset into decomp_data
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix: a "<compat>"-style tag, if present
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: header word, then the decomposition code points
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions whose first character has combining
                # class 0 and which are not composition-excluded.
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data holds flat ints while `decomp`
                # is a list, so this .index() lookup appears to always
                # raise ValueError (no sharing ever happens); seems to
                # cost table size only, not correctness -- confirm.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # 3) renumber NFC first/last characters densely and collect the
    #    contiguous code point ranges they occupy
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # dense 2D lookup: comp_data[first_index*total_last+last_index] -> composed
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions.
    # `table` here is the per-codepoint change-record sequence built by
    # merge_old_version (note: rebinds the earlier local of the same name).
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()
335
336# --------------------------------------------------------------------
337# unicode character type tables
338
339def makeunicodetype(unicode, trace):
340
341 FILE = "Objects/unicodetype_db.h"
342
343 print "--- Preparing", FILE, "..."
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000344
345 # extract unicode types
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000346 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000347 table = [dummy]
348 cache = {0: dummy}
349 index = [0] * len(unicode.chars)
350
351 for char in unicode.chars:
352 record = unicode.table[char]
353 if record:
354 # extract database properties
355 category = record[2]
356 bidirectional = record[4]
357 flags = 0
358 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
359 flags |= ALPHA_MASK
360 if category == "Ll":
361 flags |= LOWER_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000362 if category == "Zl" or bidirectional == "B":
363 flags |= LINEBREAK_MASK
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000364 if category == "Zs" or bidirectional in ("WS", "B", "S"):
365 flags |= SPACE_MASK
Fredrik Lundh375732c2000-09-25 23:03:34 +0000366 if category == "Lt":
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000367 flags |= TITLE_MASK
368 if category == "Lu":
369 flags |= UPPER_MASK
370 # use delta predictor for upper/lower/title
371 if record[12]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000372 upper = int(record[12], 16) - char
373 assert -32768 <= upper <= 32767
374 upper = upper & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000375 else:
376 upper = 0
377 if record[13]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000378 lower = int(record[13], 16) - char
379 assert -32768 <= lower <= 32767
380 lower = lower & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000381 else:
382 lower = 0
383 if record[14]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000384 title = int(record[14], 16) - char
385 assert -32768 <= lower <= 32767
386 title = title & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000387 else:
388 title = 0
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000389 # decimal digit, integer digit
390 decimal = 0
391 if record[6]:
392 flags |= DECIMAL_MASK
393 decimal = int(record[6])
394 digit = 0
395 if record[7]:
396 flags |= DIGIT_MASK
397 digit = int(record[7])
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000398 item = (
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000399 upper, lower, title, decimal, digit, flags
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000400 )
401 # add entry to index and item tables
402 i = cache.get(item)
403 if i is None:
404 cache[item] = i = len(table)
405 table.append(item)
406 index[char] = i
407
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000408 print len(table), "unique character type entries"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000409
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000410 print "--- Writing", FILE, "..."
411
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000412 fp = open(FILE, "w")
Fred Drake9c685052000-10-26 03:56:46 +0000413 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
414 print >>fp
415 print >>fp, "/* a list of unique character type descriptors */"
416 print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000417 for item in table:
Fred Drake9c685052000-10-26 03:56:46 +0000418 print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
419 print >>fp, "};"
420 print >>fp
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000421
422 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000423 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000424
Fred Drake9c685052000-10-26 03:56:46 +0000425 print >>fp, "/* type indexes */"
426 print >>fp, "#define SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000427 Array("index1", index1).dump(fp, trace)
428 Array("index2", index2).dump(fp, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000429
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000430 fp.close()
431
432# --------------------------------------------------------------------
433# unicode name database
434
def makeunicodename(unicode, trace):
    """Write Modules/unicodename_db.h: the compressed code->name
    phrasebook/lexicon and the static name->code hash table."""

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names (skip "<...>" placeholder names such as ranges)
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence.  the
    # latter includes the trailing null byte.

    words = {}  # word -> list whose length is the word's frequency
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need:
    # words beyond the first `short` get two-byte codes
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}  # rebound: word -> phrasebook code

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes (escape byte + low byte)
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # phrasebook entries must fit in one byte each
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table into two levels
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
595
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000596
def merge_old_version(version, new, old):
    """Record the differences between UnicodeData *old* and *new*.

    Appends (version, change_table, normalization_changes) to new.changed,
    where change_table pairs up per-codepoint (bidir, category, decimal,
    numeric) change records and normalization_changes is a list of
    (codepoint, old-decomposition) pairs.

    Raises NotImplementedError if the composition exclusions differ.
    """
    # BUGFIX: `re` is used below but was never imported anywhere in this
    # module, so any numeric-value change raised NameError.
    import re

    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ, field by field
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # Since 0 encodes "no change", the old value is better not 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            assert re.match("^[0-9]+$", value)
                            numeric_changes[i] = int(value)
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        # unexpected field change: fail loudly
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, numeric_changes),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000670
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000671
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000672# --------------------------------------------------------------------
673# the following support code is taken from the unidb utilities
674# Copyright (c) 1999-2000 by Secret Labs AB
675
676# load a unicode-data file from disk
677
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000678import sys
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000679
class UnicodeData:
    # Parsed UCD: .table maps each code point to its semicolon-split
    # UnicodeData.txt record (with the east asian width appended as
    # field 15), or None if unassigned; .chars is the code point range;
    # .exclusions is the composition exclusion set; .changed collects
    # old-version deltas (filled in by merge_old_version).

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        """Load UnicodeData, CompositionExclusions and EastAsianWidth
        files.  If expand is true, <...First>/<...Last> range markers
        are expanded into one record per code point."""
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    # inside a First..Last range: clone the First record
                    # with this code point substituted
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # composition exclusions: one code point per non-comment line
        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        # east asian widths: "XXXX;W" or "XXXX..YYYY;W" lines;
        # the width code is appended to each assigned record (field 15)
        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
749
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000750# hash table tools
751
752# this is a straight-forward reimplementation of Python's built-in
753# dictionary type, using a static data structure, and a custom string
754# hash algorithm.
755
def myhash(s, magic):
    """Hash *s* case-insensitively into 24 bits.

    Multiplicative hash with *magic* as the multiplier; whenever the
    value grows past 24 bits, the overflowing byte is folded back in
    with XOR and the result is masked to 24 bits again.
    """
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        overflow = h & 0xff000000
        if overflow:
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
764
# Candidate (table size, polynomial) pairs for the static hash table.
# Sizes are powers of two; the polynomial drives the probe sequence
# rehashing in Hash.__init__ (incr ^ poly when incr overflows).
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
771
772class Hash:
773 def __init__(self, name, data, magic):
774 # turn a (key, value) list into a static hash table structure
775
776 # determine table size
777 for size, poly in SIZES:
778 if size > len(data):
779 poly = size + poly
780 break
781 else:
782 raise AssertionError, "ran out of polynominals"
783
784 print size, "slots in hash table"
785
786 table = [None] * size
787
788 mask = size-1
789
790 n = 0
791
792 hash = myhash
793
794 # initialize hash table
795 for key, value in data:
796 h = hash(key, magic)
797 i = (~h) & mask
798 v = table[i]
799 if v is None:
800 table[i] = value
801 continue
802 incr = (h ^ (h >> 3)) & mask;
803 if not incr:
804 incr = mask
805 while 1:
806 n = n + 1
807 i = (i + incr) & mask
808 v = table[i]
809 if v is None:
810 table[i] = value
811 break
812 incr = incr << 1
813 if incr > mask:
814 incr = incr ^ poly
815
816 print n, "collisions"
817 self.collisions = n
818
819 for i in range(len(table)):
820 if table[i] is None:
821 table[i] = 0
822
823 self.data = Array(name + "_hash", table)
824 self.magic = magic
825 self.name = name
826 self.size = size
827 self.poly = poly
828
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000829 def dump(self, file, trace):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000830 # write data to file, as a C array
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000831 self.data.dump(file, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000832 file.write("#define %s_magic %d\n" % (self.name, self.magic))
833 file.write("#define %s_size %d\n" % (self.name, self.size))
834 file.write("#define %s_poly %d\n" % (self.name, self.poly))
835
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000836# stuff to deal with arrays of unsigned integers
837
class Array:
    # a named sequence of unsigned integers that can be written out
    # as a static C array

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # emit the array as C source, using the smallest element type
        # that can hold every value
        itemsize = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", itemsize*len(self.data), "bytes"
        file.write("static ")
        if itemsize == 1:
            file.write("unsigned char")
        elif itemsize == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = "    "
            for value in self.data:
                piece = "%s, " % value
                # wrap before the line would exceed 78 columns
                if len(line) + len(piece) > 78:
                    file.write(line + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
869
def getsize(data):
    # return the smallest C integer size (in bytes) able to hold every
    # value in the given array: 1, 2, or 4
    largest = max(data)
    if largest < 256:
        return 1
    if largest < 65536:
        return 2
    return 4
879
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    # (removed a redundant function-local "import sys"; the module
    # already imports sys at the top of the file)
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    # try every bin size 2**shift and keep the cheapest decomposition
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}   # maps a bin (tuple slice) to its offset in t2
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                # first time we see this bin: append it to t2
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000941
# command-line entry point: regenerate the database files
if __name__ == "__main__":
    # NOTE(review): the argument appears to be a trace flag passed to
    # maketables() (defined earlier in this file) -- confirm there
    maketables(1)