blob: a97d4caf58f0262d0d7f70e99ffdd8f350840c76 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
26
import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.2.0"
# Input file name templates.  The %s slot is filled with "" for the
# current version, or "-<version>" for one of old_versions (see
# maketables below).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

# Older UCD releases for which change-record delta tables are generated
# (consumed in makeunicodedata's "Generate delta tables" section).
old_versions = ["3.2.0"]

# General-category abbreviations; table entries store *indexes* into this
# list.  NOTE(review): "Cn" appears both at index 0 (used for unassigned
# code points) and at index 17 -- the duplicate looks intentional but is
# worth keeping in mind when searching with .index().
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class abbreviations; index 0 (empty string) means "none".
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East-Asian-width abbreviations (Fullwidth, Halfwidth, Wide, Narrow,
# Ambiguous, Neutral).
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Character-type flag bits stored in _PyUnicode_TypeRecords.flags.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100     # upper/lower/title fields hold absolute values, not deltas
NUMERIC_MASK = 0x200
Fredrik Lundhe9133f72000-09-25 17:59:57 +000064
def maketables(trace=0):
    """Regenerate the three generated headers from the UCD text files.

    Reads the current Unicode database files (and, for every entry in
    old_versions, the correspondingly suffixed old files), then rewrites
    Modules/unicodename_db.h, Modules/unicodedata_db.h and
    Objects/unicodetype_db.h via the three make* functions below.

    trace -- passed through to the table writers; presumably enables
    extra diagnostics in splitbins/Array (defined elsewhere in this
    script) -- TODO confirm.
    """

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    # UnicodeData (defined elsewhere in this script) parses the UCD text
    # files; the attributes read below are .table, .chars, .exclusions
    # and .changed.
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          UNIHAN % version,
                          DERIVEDNORMALIZATION_PROPS % version)

    # .table has one entry per code point; unassigned ones are None.
    print len(filter(None, unicode.table)), "characters"

    # Merge change records for each supported old UCD version; old files
    # use a "-<version>" suffix, e.g. UnicodeData-3.2.0.txt.
    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version),
                                  UNIHAN % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000090
# --------------------------------------------------------------------
# unicode character properties
93
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h.

    Emits: the deduplicated database-record table, NFC first/last
    reindex tables, the category/bidi/east-asian-width name string
    arrays, compressed decomposition data, the NFC composition pair
    table, and per-old-version change-record tables.

    unicode -- a UnicodeData instance (see maketables).
    trace   -- forwarded to splitbins/Array diagnostics.
    """

    # One dummy record at index 0 so that index 0 can mean "unassigned".
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties
    # Deduplicate (category, combining, bidi, mirrored, width, quickcheck)
    # tuples; index[] maps each code point to its record number.

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[16]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data
    # decomp_data is one flat int array: for each decomposition, a header
    # word (length<<8 | prefix-index) followed by the decomposed code
    # points; decomp_index[] holds each code point's offset into it.

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # the generated C code (nfd_nfkd) has a fixed-size buffer
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix: a leading "<tag>" marks a compatibility mapping
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                # the prefix index must fit in the low byte of the header word
                assert prefix < 256
                # content: header word, then the mapped code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions that are not excluded and whose first
                # character has combining class 0.
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints, so
                # .index(decomp) (a list) can never match and always raises
                # ValueError -- identical decompositions are therefore never
                # actually shared, only appended.  Harmless for correctness
                # of the output, but the dedup intent is not realized;
                # verify before relying on "unique entries" below.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber NFC first/last characters densely (0..total-1) and record
    # the contiguous code-point ranges they occupy, for the reindex tables.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                # extend the current contiguous range
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense 2-D composition table, flattened row-major:
    # comp_data[first*total_last + last] -> composed character (0 = none).
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # reindex tables map a code point range onto the dense NFC numbering
    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table (two-level trie; see splitbins)
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions.  unicode.changed holds, per
    # old version, a per-code-point change-record table and a list of
    # (codepoint, normalization) pairs (filled in by merge_old_version).
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        # emit a C accessor that walks the two-level index
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()
343
# --------------------------------------------------------------------
# unicode character type tables
346
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h.

    Emits the deduplicated character-type record table (upper/lower/title
    mappings, decimal/digit values, flag bits), its two-level index, and
    generated C functions _PyUnicode_ToNumeric, _PyUnicode_IsWhitespace
    and _PyUnicode_IsLinebreak.

    unicode -- a UnicodeData instance (see maketables).
    trace   -- forwarded to splitbins/Array diagnostics.
    """

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}        # numeric-value string -> list of code points
    spaces = []         # code points with the whitespace property
    linebreaks = []     # code points with the linebreak property

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            delta = True   # NOTE(review): assigned but never read in this function
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas, stored as unsigned 16-bit two's complement
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                # deltas don't fit; store absolute values and flag it
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                # record[8] is the numeric value as text (may be "a/b")
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"
    print sum(map(len, numeric.values())), "numeric code points"
    print len(spaces), "whitespace code points"
    print len(linebreaks), "linebreak code points"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print >>fp, '/* Returns the numeric value as double for Unicode characters'
    print >>fp, ' * having this property, -1.0 otherwise.'
    print >>fp, ' */'
    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    for value, codepoints in numeric_items:
        # Turn text into float literals (fractions like "1/4" become
        # a C division of two float literals)
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        # wrap cases >= U+10000 in #ifdef Py_UNICODE_WIDE so narrow
        # builds still compile
        haswide = False
        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
            if codepoint < 0x10000:
                hasnonewide = True
            if codepoint >= 0x10000 and not haswide:
                print >>fp, '#ifdef Py_UNICODE_WIDE'
                haswide = True
            print >>fp, '    case 0x%04X:' % (codepoint,)
        if haswide and hasnonewide:
            print >>fp, '#endif'
        print >>fp, '        return (double) %s;' % (value,)
        if haswide and not hasnonewide:
            print >>fp, '#endif'
    print >>fp,'    }'
    print >>fp,'    return -1.0;'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsWhitespace()
    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
    print >>fp, '    return iswspace(ch);'
    print >>fp, '#else'
    print >>fp, '    switch (ch) {'

    # same narrow/wide-build #ifdef dance as above
    haswide = False
    hasnonewide = False
    for codepoint in sorted(spaces):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp, '#endif'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsLinebreak()
    print >>fp, "/* Returns 1 for Unicode characters having the category 'Zl',"
    print >>fp, " * 'Zp' or type 'B', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    haswide = False
    hasnonewide = False
    for codepoint in sorted(linebreaks):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp,'}'
    print >>fp

    fp.close()
554
# --------------------------------------------------------------------
# unicode name database
557
558def makeunicodename(unicode, trace):
559
560 FILE = "Modules/unicodename_db.h"
561
562 print "--- Preparing", FILE, "..."
563
564 # collect names
565 names = [None] * len(unicode.chars)
566
567 for char in unicode.chars:
568 record = unicode.table[char]
569 if record:
570 name = record[1].strip()
571 if name and name[0] != "<":
572 names[char] = name + chr(0)
573
574 print len(filter(lambda n: n is not None, names)), "distinct names"
575
576 # collect unique words from names (note that we differ between
577 # words inside a sentence, and words ending a sentence. the
578 # latter includes the trailing null byte.
579
580 words = {}
581 n = b = 0
582 for char in unicode.chars:
583 name = names[char]
584 if name:
585 w = name.split()
586 b = b + len(name)
587 n = n + len(w)
588 for w in w:
589 l = words.get(w)
590 if l:
591 l.append(None)
592 else:
593 words[w] = [len(words)]
594
595 print n, "words in text;", b, "bytes"
596
597 wordlist = words.items()
598
Martin v. Löwis97225da2002-11-24 23:05:09 +0000599 # sort on falling frequency, then by name
Florent Xiclunadc364722010-03-15 14:00:58 +0000600 def word_key(a):
601 aword, alist = a
602 return -len(alist), aword
603 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000604
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000605 # figure out how many phrasebook escapes we need
606 escapes = 0
607 while escapes * 256 < len(wordlist):
608 escapes = escapes + 1
609 print escapes, "escapes"
610
611 short = 256 - escapes
612
613 assert short > 0
614
615 print short, "short indexes in lexicon"
616
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000617 # statistics
618 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000619 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000620 n = n + len(wordlist[i][1])
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000621 print n, "short indexes in phrasebook"
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000622
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000623 # pick the most commonly used words, and sort the rest on falling
624 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000625
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000626 wordlist, wordtail = wordlist[:short], wordlist[short:]
Florent Xiclunadc364722010-03-15 14:00:58 +0000627 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000628 wordlist.extend(wordtail)
629
630 # generate lexicon from words
631
632 lexicon_offset = [0]
633 lexicon = ""
634 words = {}
635
636 # build a lexicon string
637 offset = 0
638 for w, x in wordlist:
639 # encoding: bit 7 indicates last character in word (chr(128)
640 # indicates the last character in an entire string)
641 ww = w[:-1] + chr(ord(w[-1])+128)
642 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000643 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000644 if o < 0:
645 o = offset
646 lexicon = lexicon + ww
647 offset = offset + len(w)
648 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000649 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000650
651 lexicon = map(ord, lexicon)
652
653 # generate phrasebook from names and lexicon
654 phrasebook = [0]
655 phrasebook_offset = [0] * len(unicode.chars)
656 for char in unicode.chars:
657 name = names[char]
658 if name:
659 w = name.split()
660 phrasebook_offset[char] = len(phrasebook)
661 for w in w:
662 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000663 if i < short:
664 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000665 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000666 # store as two bytes
667 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000668 phrasebook.append(i&255)
669
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000670 assert getsize(phrasebook) == 1
671
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000672 #
673 # unicode name hash table
674
675 # extract names
676 data = []
677 for char in unicode.chars:
678 record = unicode.table[char]
679 if record:
680 name = record[1].strip()
681 if name and name[0] != "<":
682 data.append((name, char))
683
684 # the magic number 47 was chosen to minimize the number of
685 # collisions on the current data set. if you like, change it
686 # and see what happens...
687
688 codehash = Hash("code", data, 47)
689
690 print "--- Writing", FILE, "..."
691
692 fp = open(FILE, "w")
693 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
694 print >>fp
695 print >>fp, "#define NAME_MAXLEN", 256
696 print >>fp
697 print >>fp, "/* lexicon */"
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000698 Array("lexicon", lexicon).dump(fp, trace)
699 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000700
701 # split decomposition index table
702 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
703
704 print >>fp, "/* code->name phrasebook */"
705 print >>fp, "#define phrasebook_shift", shift
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000706 print >>fp, "#define phrasebook_short", short
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000707
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000708 Array("phrasebook", phrasebook).dump(fp, trace)
709 Array("phrasebook_offset1", offset1).dump(fp, trace)
710 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000711
712 print >>fp, "/* name->code dictionary */"
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000713 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000714
715 fp.close()
716
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000717
def merge_old_version(version, new, old):
    """Compute per-codepoint deltas between an *old* UnicodeData snapshot
    and the *new* one, and append them to new.changed.

    The appended entry is (version, changes, normalization_changes), where
    changes zips the per-codepoint lists (bidir, category, decimal,
    mirrored, numeric) and normalization_changes is a list of
    (codepoint, old 1:1 decomposition) pairs.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            # k indexes the semicolon-split fields of the record
            # (built by UnicodeData.__init__, plus appended extras)
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # NOTE(review): stores the characters '1'/'0' although
                        # the "no change" sentinel is the int 0xFF -- mixed
                        # types; confirm the consumer of new.changed copes.
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        # any other field changing is unexpected; fail loudly
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000797
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000798
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000799# --------------------------------------------------------------------
800# the following support code is taken from the unidb utilities
801# Copyright (c) 1999-2000 by Secret Labs AB
802
803# load a unicode-data file from disk
804
class UnicodeData:
    """In-memory representation of the Unicode character database.

    table[codepoint] holds the semicolon-split record (a list of strings)
    for that codepoint, with extra fields appended by __init__, or None if
    the codepoint is unassigned.  Also loads the composition-exclusion
    list, east-asian widths, optional normalization quick-check data and
    Unihan numeric values.
    """

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivednormalizationprops=None, expand=1):
        # NOTE(review): the file handles opened below are never closed
        # explicitly; harmless in a short-lived build script, but worth
        # confirming if this is ever reused as a library.
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        # one record per line: "codepoint;field;field;..."
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        # (ranges are marked by "<..., First>" / "<..., Last>" names;
        # every codepoint in between gets a copy of the First record
        # with its own codepoint and an emptied name field)
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                    elif field:
                        f2 = field[:]
                        f2[0] = "%X" % i
                        table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        # composition exclusions: one codepoint per non-comment line
        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        # east asian width: "codepoint;width" or "first..last;width";
        # the width is appended as a new field on every assigned record
        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])
        if derivednormalizationprops:
            # pack the four quick-check properties into one int per
            # codepoint, two bits each (0 = Yes, 1 = Maybe, 2 = No),
            # appended as another field on every assigned record
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            for s in open(derivednormalizationprops):
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    # each 2-bit slot may be set at most once
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        # Unihan numeric properties override field 8 (the numeric field)
        for line in open(unihan):
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
910
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000911# hash table tools
912
# this is a straightforward reimplementation of Python's built-in
914# dictionary type, using a static data structure, and a custom string
915# hash algorithm.
916
def myhash(s, magic):
    """Case-insensitive string hash used for the static name->code table.

    Multiplies the running hash by *magic* per character and folds any
    bits above bit 23 back into the low 24 bits, so the result always
    fits in 24 bits.  Must match the C implementation that reads the
    generated table.
    """
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        # fold overflow beyond 24 bits back into the low byte
        # (0xff000000 needs no 'L' suffix: Python 2 ints auto-promote,
        # and the plain literal also compiles under Python 3)
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h
925
# candidate (size, poly) pairs for the static hash table: power-of-two
# slot counts paired with the polynomial constants that Hash uses to
# reduce the probe increment when it overflows the table mask.
# Per the comment in makeunicodename above, the constants were chosen
# empirically to minimize collisions on the current data set.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
932
class Hash:
    """Static open-addressing hash table, dumpable as C data.

    Places the (key, value) pairs from *data* into a power-of-two table
    using myhash() with the given *magic* multiplier; collisions are
    resolved with a doubling probe increment reduced by the per-size
    polynomial constant from SIZES.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: smallest SIZES entry strictly larger
        # than the data set
        for size, poly in SIZES:
            if size > len(data):
                # equivalent to size | poly (poly < size, size is a
                # power of two): adds the high-order polynomial term
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynominals"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0   # total collision count (for reporting only)

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: derive a secondary probe increment from h
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                # LFSR-style step: double the increment, reducing by
                # the polynomial when it overflows the table mask
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        # empty slots are encoded as value 0 in the C table
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the #defines
        # the C lookup code needs to reproduce the probing
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
996
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000997# stuff to deal with arrays of unsigned integers
998
999class Array:
1000
1001 def __init__(self, name, data):
1002 self.name = name
1003 self.data = data
1004
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001005 def dump(self, file, trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001006 # write data to file, as a C array
1007 size = getsize(self.data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001008 if trace:
1009 print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001010 file.write("static ")
1011 if size == 1:
1012 file.write("unsigned char")
1013 elif size == 2:
1014 file.write("unsigned short")
1015 else:
1016 file.write("unsigned int")
1017 file.write(" " + self.name + "[] = {\n")
1018 if self.data:
1019 s = " "
1020 for item in self.data:
1021 i = str(item) + ", "
1022 if len(s) + len(i) > 78:
1023 file.write(s + "\n")
1024 s = " " + i
1025 else:
1026 s = s + i
Walter Dörwaldaaab30e2002-09-11 20:36:02 +00001027 if s.strip():
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001028 file.write(s + "\n")
1029 file.write("};\n\n")
1030
def getsize(data):
    """Return the smallest C integer size (1, 2 or 4 bytes) that can
    hold every value in *data*."""
    peak = max(data)
    if peak < 0x100:
        return 1
    if peak < 0x10000:
        return 2
    return 4
1040
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    # try every candidate shift (bin size 2**shift) and keep the
    # cheapest two-level encoding found
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}   # bin contents -> offset of its first copy in t2
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                # first occurrence of this bin: append it to t2
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001101
# When run as a script, regenerate the Unicode database headers.
# maketables is defined earlier in the file; the argument presumably
# enables trace output -- confirm against its signature.
if __name__ == "__main__":
    maketables(1)