#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl    created (based on bits and pieces from unidb)
# 2000-09-25 fl    merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl    added character type table
# 2000-09-26 fl    added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl    expand first/last ranges
# 2001-01-19 fl    added character name tables (2.1)
# 2001-01-21 fl    added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd    use string methods
# 2002-10-18 mvl   update to Unicode 3.2
# 2002-10-22 mvl   generate NFC tables
# 2002-11-24 mvl   expand all ranges, sort names version-independently
# 2002-11-25 mvl   add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
25
26import sys
27
# Script identity; both values are embedded in the generated C headers.
SCRIPT = sys.argv[0]
VERSION = "2.5"

# The Unicode Database
# Input file name templates; "%s" is "" for the current version or
# "-<version>" for an old version (see maketables / old_versions).
UNIDATA_VERSION = "4.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"

# Old Unicode versions for which delta tables are generated
# (used by unicodedata.ucd_3_2_0 style lookups).
old_versions = ["3.2.0"]

# General-category abbreviations; list position is the numeric code
# stored in the generated tables.  Note that "Cn" (unassigned) appears
# twice: index 0 is the sentinel used for unassigned code points.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class abbreviations; position is the numeric code.
# Index 0 (empty string) means "no bidi class given".
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East-Asian-width abbreviations; position is the numeric code.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# character-type flag bits, combined into the flags field of each
# type record by makeunicodetype()
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
59
def maketables(trace=0):
    # Top-level driver: read the current Unicode database plus every old
    # version listed in old_versions, record the deltas, and regenerate
    # all three C header files.
    #   trace -- passed through to splitbins()/Array.dump() for verbose
    #            statistics output.

    print "--- Reading", UNICODE_DATA % "", "..."

    # current version uses the un-suffixed data files
    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version)

    # count assigned code points (table slots that are not None)
    print len(filter(None, unicode.table)), "characters"

    # fold each old Unicode version into the new database as a set of
    # change records (stored on unicode.changed by merge_old_version)
    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    # generate the three output headers
    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000082
83# --------------------------------------------------------------------
84# unicode character properties
85
def makeunicodedata(unicode, trace):
    # Generate Modules/unicodedata_db.h: the per-character property
    # records, decomposition data, NFC composition pairs, and delta
    # tables for old Unicode versions.

    # record 0 is the dummy used for unassigned code points
    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties
    #    deduplicate (category, combining, bidir, mirrored, width)
    #    tuples; index[] maps each code point to its record number

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data
    #    decomp_data is a flat int array: for each decomposition, a
    #    header word (prefix index in the low byte, length in the high
    #    byte) followed by the decomposed code points

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # the C code's nfd_nfkd() buffer limits decomposition length
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                # prefix index must fit in the header word's low byte
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions, not excluded, whose first character has
                # combining class 0
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp    # p (header word) unused here
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints, so
                # .index(decomp) with a list argument always raises
                # ValueError -- identical decompositions are never shared.
                # Output is still correct, just larger; confirm before
                # changing.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # renumber NFC first/last characters densely and collect the
    # contiguous code-point ranges they occupy
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # dense 2D table: composed character for (first, last) pairs
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table (two-level lookup to save space)
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    # split the NFC pair table the same way
    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    # (unicode.changed is filled in by merge_old_version)
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()
334
335# --------------------------------------------------------------------
336# unicode character type tables
337
338def makeunicodetype(unicode, trace):
339
340 FILE = "Objects/unicodetype_db.h"
341
342 print "--- Preparing", FILE, "..."
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000343
344 # extract unicode types
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000345 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000346 table = [dummy]
347 cache = {0: dummy}
348 index = [0] * len(unicode.chars)
349
350 for char in unicode.chars:
351 record = unicode.table[char]
352 if record:
353 # extract database properties
354 category = record[2]
355 bidirectional = record[4]
356 flags = 0
357 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
358 flags |= ALPHA_MASK
359 if category == "Ll":
360 flags |= LOWER_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000361 if category == "Zl" or bidirectional == "B":
362 flags |= LINEBREAK_MASK
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000363 if category == "Zs" or bidirectional in ("WS", "B", "S"):
364 flags |= SPACE_MASK
Fredrik Lundh375732c2000-09-25 23:03:34 +0000365 if category == "Lt":
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000366 flags |= TITLE_MASK
367 if category == "Lu":
368 flags |= UPPER_MASK
369 # use delta predictor for upper/lower/title
370 if record[12]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000371 upper = int(record[12], 16) - char
372 assert -32768 <= upper <= 32767
373 upper = upper & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000374 else:
375 upper = 0
376 if record[13]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000377 lower = int(record[13], 16) - char
378 assert -32768 <= lower <= 32767
379 lower = lower & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000380 else:
381 lower = 0
382 if record[14]:
Martin v. Löwis99ac3282002-10-18 17:34:18 +0000383 title = int(record[14], 16) - char
384 assert -32768 <= lower <= 32767
385 title = title & 0xffff
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000386 else:
387 title = 0
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000388 # decimal digit, integer digit
389 decimal = 0
390 if record[6]:
391 flags |= DECIMAL_MASK
392 decimal = int(record[6])
393 digit = 0
394 if record[7]:
395 flags |= DIGIT_MASK
396 digit = int(record[7])
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000397 item = (
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000398 upper, lower, title, decimal, digit, flags
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000399 )
400 # add entry to index and item tables
401 i = cache.get(item)
402 if i is None:
403 cache[item] = i = len(table)
404 table.append(item)
405 index[char] = i
406
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000407 print len(table), "unique character type entries"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000408
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000409 print "--- Writing", FILE, "..."
410
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000411 fp = open(FILE, "w")
Fred Drake9c685052000-10-26 03:56:46 +0000412 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
413 print >>fp
414 print >>fp, "/* a list of unique character type descriptors */"
415 print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000416 for item in table:
Fred Drake9c685052000-10-26 03:56:46 +0000417 print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
418 print >>fp, "};"
419 print >>fp
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000420
421 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000422 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000423
Fred Drake9c685052000-10-26 03:56:46 +0000424 print >>fp, "/* type indexes */"
425 print >>fp, "#define SHIFT", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000426 Array("index1", index1).dump(fp, trace)
427 Array("index2", index2).dump(fp, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000428
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000429 fp.close()
430
431# --------------------------------------------------------------------
432# unicode name database
433
def makeunicodename(unicode, trace):
    # Generate Modules/unicodename_db.h: a compressed code->name
    # "phrasebook" (names encoded as sequences of word indexes into a
    # shared lexicon) plus a static name->code hash table.

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names; "<...>" entries are range markers, not real names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence.  the
    # latter includes the trailing null byte.

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            # note: "for w in w" deliberately rebinds w to each word
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    # list length doubles as the word's frequency count
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need
    # (words beyond the first `short` are stored as two bytes)
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes: escape byte, then low byte
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry must fit in one byte
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table into a two-level index
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
594
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000595
def merge_old_version(version, new, old):
    # Compute the delta between an old UnicodeData version and the new
    # one, and record it on new.changed as a tuple
    #   (version, change-records, normalization-changes)
    # where change-records is a per-code-point list of
    # (bidir, category, decimal, numeric) deltas and
    # normalization-changes is a list of (char, old-decomposition) pairs.
    #
    # Raises NotImplementedError if the composition exclusions differ
    # (handling that is not implemented), and AssertionError on deltas
    # this encoding cannot represent.

    import re  # bug fix: `re` is used below but never imported at module level

    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ, field by field
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        # general category
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        # bidirectional class
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # decomposition mapping
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # decimal digit value
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # numeric value
                        # Since 0 encodes "no change", the old value is better not 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            assert re.match("^[0-9]+$", value)
                            numeric_changes[i] = int(value)
                    elif k in (11, 12, 13, 14):
                        # changes to the ISO comment or to the simple
                        # upper/lower/titlecase mappings are ignored
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    # list(zip(...)) so the delta table stays indexable even where zip
    # returns an iterator (Python 3); a no-op under Python 2
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, numeric_changes)),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000669
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000670
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000677import sys
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000678
class UnicodeData:
    # Parsed in-memory form of the Unicode character database.
    #
    # Public attributes:
    #   filename   -- name of the UnicodeData file read
    #   table      -- list of 0x110000 entries; each assigned code point
    #                 holds the semicolon-split record fields (plus the
    #                 east-asian width appended as the last field),
    #                 unassigned code points hold None
    #   chars      -- iterable of all code points considered
    #   exclusions -- dict of code points from the composition
    #                 exclusions file
    #   changed    -- filled in later by merge_old_version()

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        # NOTE(review): the file objects opened here are never closed
        # explicitly; the script relies on refcounting to close them.
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges: copy the <..., First> record to
        # every code point up to the matching <..., Last> marker
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # composition exclusions: one code point per non-comment line
        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        # east asian width: "<char>;<width>" or "<first>..<last>;<width>"
        # lines; the width code is appended to every assigned record
        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
748
# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    """Case-insensitive string hash matching the C lookup code in the
    generated header.

    Multiplies the running hash by *magic* per character; whenever any
    of bits 24-31 become set, that byte is folded back into the low 24
    bits, keeping the result small.
    """
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        # bug fix: the mask previously used the Python 2-only long
        # literal 0xff000000L; the plain literal is equivalent (ints
        # promote to longs automatically) and also valid on Python 3
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h
763
# candidate hash table sizes: (size, poly) pairs, where size is a power
# of two and poly is combined with size to fold the probe increment in
# Hash.__init__ (incr ^ poly when the doubled increment overflows mask)
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
770
771class Hash:
772 def __init__(self, name, data, magic):
773 # turn a (key, value) list into a static hash table structure
774
775 # determine table size
776 for size, poly in SIZES:
777 if size > len(data):
778 poly = size + poly
779 break
780 else:
781 raise AssertionError, "ran out of polynominals"
782
783 print size, "slots in hash table"
784
785 table = [None] * size
786
787 mask = size-1
788
789 n = 0
790
791 hash = myhash
792
793 # initialize hash table
794 for key, value in data:
795 h = hash(key, magic)
796 i = (~h) & mask
797 v = table[i]
798 if v is None:
799 table[i] = value
800 continue
801 incr = (h ^ (h >> 3)) & mask;
802 if not incr:
803 incr = mask
804 while 1:
805 n = n + 1
806 i = (i + incr) & mask
807 v = table[i]
808 if v is None:
809 table[i] = value
810 break
811 incr = incr << 1
812 if incr > mask:
813 incr = incr ^ poly
814
815 print n, "collisions"
816 self.collisions = n
817
818 for i in range(len(table)):
819 if table[i] is None:
820 table[i] = 0
821
822 self.data = Array(name + "_hash", table)
823 self.magic = magic
824 self.name = name
825 self.size = size
826 self.poly = poly
827
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000828 def dump(self, file, trace):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000829 # write data to file, as a C array
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000830 self.data.dump(file, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000831 file.write("#define %s_magic %d\n" % (self.name, self.magic))
832 file.write("#define %s_size %d\n" % (self.name, self.size))
833 file.write("#define %s_poly %d\n" % (self.name, self.poly))
834
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000835# stuff to deal with arrays of unsigned integers
836
class Array:
    # Wraps a C identifier and a list of unsigned ints so they can be
    # written out as a static C array (see dump).

    def __init__(self, name, data):
        # name: C identifier for the generated array
        # data: sequence of non-negative ints
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)   # smallest element width that fits
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        # pick the matching C element type for that width
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            # emit the items comma-separated, wrapping at ~78 columns
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            # flush the final, partially filled line (if any)
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")
868
def getsize(data):
    """Return the byte width (1, 2 or 4) of the smallest unsigned C
    integer type able to hold every value in data."""
    largest = max(data)
    if largest < 0x100:
        return 1
    if largest < 0x10000:
        return 2
    return 4
878
Tim Peters21013482000-09-25 07:13:41 +0000879def splitbins(t, trace=0):
880 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
881
882 t is a sequence of ints. This function can be useful to save space if
883 many of the ints are the same. t1 and t2 are lists of ints, and shift
884 is an int, chosen to minimize the combined size of t1 and t2 (in C
885 code), and where for each i in range(len(t)),
886 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
887 where mask is a bitmask isolating the last "shift" bits.
888
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000889 If optional arg trace is non-zero (default zero), progress info
890 is printed to sys.stderr. The higher the value, the more info
891 you'll get.
Tim Peters21013482000-09-25 07:13:41 +0000892 """
893
894 import sys
895 if trace:
896 def dump(t1, t2, shift, bytes):
897 print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
898 len(t1), len(t2), shift, bytes)
899 print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
900 "bytes"
901 n = len(t)-1 # last valid index
902 maxshift = 0 # the most we can shift n and still have something left
903 if n > 0:
904 while n >> 1:
905 n >>= 1
906 maxshift += 1
907 del n
908 bytes = sys.maxint # smallest total size so far
909 t = tuple(t) # so slices can be dict keys
910 for shift in range(maxshift + 1):
911 t1 = []
912 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000913 size = 2**shift
914 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +0000915 for i in range(0, len(t), size):
916 bin = t[i:i+size]
917 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000918 if index is None:
Tim Peters21013482000-09-25 07:13:41 +0000919 index = len(t2)
920 bincache[bin] = index
921 t2.extend(bin)
922 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000923 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +0000924 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000925 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +0000926 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000927 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +0000928 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000929 bytes = b
Tim Peters21013482000-09-25 07:13:41 +0000930 t1, t2, shift = best
931 if trace:
932 print >>sys.stderr, "Best:",
933 dump(t1, t2, shift, bytes)
934 if __debug__:
935 # exhaustively verify that the decomposition is correct
936 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
937 for i in xrange(len(t)):
938 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
939 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000940
if __name__ == "__main__":
    # regenerate the database headers; the argument is presumably a trace
    # flag (non-zero => print progress to stderr, matching splitbins/dump
    # above) -- maketables is defined earlier in this file, confirm there
    maketables(1)