blob: 330eb2d9261153ad94e9c87ca7ff54328b01c944 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00004# this script converts a unicode 3.2 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000022# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Fredrik Lundhcfcea492000-09-25 08:07:06 +000023#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000024# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000025#
26
import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.1.0"
# Data file name templates: %s is filled with "" for the current UCD, or
# with "-<version>" for an older snapshot (see maketables below).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

# older UCD releases for which delta tables are generated
old_versions = ["3.2.0"]

# General_Category values; the list position becomes the numeric code
# stored in the generated records (index 0, "Cn", doubles as "unassigned").
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidi_Class values; position becomes the numeric code in the records.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East_Asian_Width values; position becomes the numeric code.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Bit flags for the "flags" field of each type record.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
NUMERIC_MASK = 0x200
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000065def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +000066
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000067 print "--- Reading", UNICODE_DATA % "", "..."
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000068
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000069 version = ""
70 unicode = UnicodeData(UNICODE_DATA % version,
71 COMPOSITION_EXCLUSIONS % version,
Antoine Pitroue988e282009-04-27 21:53:26 +000072 EASTASIAN_WIDTH % version,
Amaury Forgeot d'Arcd0052d12009-10-06 19:56:32 +000073 UNIHAN % version,
Antoine Pitroue988e282009-04-27 21:53:26 +000074 DERIVEDNORMALIZATION_PROPS % version)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000075
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000076 print len(filter(None, unicode.table)), "characters"
77
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000078 for version in old_versions:
79 print "--- Reading", UNICODE_DATA % ("-"+version), "..."
80 old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
81 COMPOSITION_EXCLUSIONS % ("-"+version),
Amaury Forgeot d'Arcd0052d12009-10-06 19:56:32 +000082 EASTASIAN_WIDTH % ("-"+version),
83 UNIHAN % ("-"+version))
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000084 print len(filter(None, old_unicode.table)), "characters"
85 merge_old_version(version, unicode, old_unicode)
86
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000087 makeunicodename(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000088 makeunicodedata(unicode, trace)
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000089 makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000090
91# --------------------------------------------------------------------
92# unicode character properties
93
def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h: unique property records, the
    decomposition data, the NFC composition pair tables, and the delta
    tables for the old UCD versions found in unicode.changed."""

    dummy = (0, 0, 0, 0, 0, 0)  # record 0 is used for unassigned characters
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[16]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables; identical property
            # tuples share one record via the cache
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # the C NFD/NFKD buffer has a fixed capacity
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix: a leading "<tag>" marks a compatibility mapping
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs prefix index (low byte) and
                # decomposition length (high bits); rest are code points
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions, not excluded, whose first character has
                # combining class 0
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints, so
                # list.index(decomp) (a list argument) always raises
                # ValueError and the sharing branch never fires; harmless,
                # but identical decompositions are never deduplicated --
                # confirm before relying on the "unique entries" count.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # renumber the NFC first/last characters compactly and collect the
    # contiguous code point ranges they occupy
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # dense (first x last) -> composed character matrix; 0 = no pair
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table into a two-level (index1/index2) lookup
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        # deduplicate change records, keeping an index per code point
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        # emit a C accessor that walks the two-level index
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        # per-version 1:1 normalization overrides as a C switch
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()
344
345# --------------------------------------------------------------------
346# unicode character type tables
347
def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h: the character type records
    (case mappings, digit/decimal/numeric values, flag bits) plus the
    generated C helpers _PyUnicode_ToNumeric, _PyUnicode_IsWhitespace
    and _PyUnicode_IsLinebreak."""

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)  # record 0 is used for unassigned characters
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}      # numeric value string -> list of code points
    spaces = []       # code points with the whitespace property
    linebreaks = []   # code points with the linebreak property

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            # NOTE(review): "delta" is assigned but never read below --
            # looks like a leftover from the delta-predictor logic;
            # confirm before removing.
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas, stored as unsigned 16-bit two's complement
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                # deltas don't fit: store absolute values and flag it
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables; identical tuples share
            # one record via the cache
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"
    print sum(map(len, numeric.values())), "numeric code points"
    print len(spaces), "whitespace code points"
    print len(linebreaks), "linebreak code points"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = numeric.items()
    numeric_items.sort()
    print >>fp, '/* Returns the numeric value as double for Unicode characters'
    print >>fp, ' * having this property, -1.0 otherwise.'
    print >>fp, ' */'
    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        # guard cases above the BMP with #ifdef Py_UNICODE_WIDE so the
        # header also compiles on narrow (UCS-2) builds
        haswide = False
        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
            if codepoint < 0x10000:
                hasnonewide = True
            if codepoint >= 0x10000 and not haswide:
                print >>fp, '#ifdef Py_UNICODE_WIDE'
                haswide = True
            print >>fp, '    case 0x%04X:' % (codepoint,)
        if haswide and hasnonewide:
            print >>fp, '#endif'
        print >>fp, '        return (double) %s;' % (value,)
        if haswide and not hasnonewide:
            print >>fp, '#endif'
    print >>fp,'    }'
    print >>fp,'    return -1.0;'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsWhitespace()
    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
    print >>fp, '    return iswspace(ch);'
    print >>fp, '#else'
    print >>fp, '    switch (ch) {'
    print >>fp

    haswide = False
    hasnonewide = False
    spaces.sort()
    for codepoint in spaces:
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp, '#endif'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsLinebreak()
    print >>fp, "/* Returns 1 for Unicode characters having the category 'Zl',"
    print >>fp, " * 'Zp' or type 'B', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    haswide = False
    hasnonewide = False
    linebreaks.sort()
    for codepoint in linebreaks:
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp,'}'
    print >>fp

    fp.close()
558
559# --------------------------------------------------------------------
560# unicode name database
561
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h: a compressed code-point->name
    phrasebook (names split into words, words stored in a shared lexicon)
    and a name->code hash table."""

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names; skip "<...>" placeholder names (e.g. range markers)
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                # words maps word -> list whose length is its frequency
                # (the first element carries the insertion index)
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need: word indexes above
    # "short" are stored as two bytes, introduced by an escape byte
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes: escape byte (>= short) + low byte
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry must fit in one byte
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table into a two-level lookup
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
722
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000723
def merge_old_version(version, new, old):
    """Record on *new* the per-character delta needed to reconstruct the
    *old* UnicodeData snapshot.

    version -- version string of the old database
    new, old -- UnicodeData instances
    Appends (version, zipped change records, normalization changes) to
    new.changed.  Raises NotImplementedError if the composition
    exclusion sets differ, and an ad-hoc Difference exception for any
    field change this function does not know how to encode.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            # k indexes the semicolon-separated record fields parsed in
            # UnicodeData.__init__
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        # general category
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        # bidirectional class
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # decomposition mapping
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # decimal digit value
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # numeric value
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # mirrored property ('Y' or something else in the data)
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        # any other field change is unexpected; fail loudly
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000803
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000804
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000805# --------------------------------------------------------------------
806# the following support code is taken from the unidb utilities
807# Copyright (c) 1999-2000 by Secret Labs AB
808
809# load a unicode-data file from disk
810
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000811class UnicodeData:
812
Amaury Forgeot d'Arcd0052d12009-10-06 19:56:32 +0000813 def __init__(self, filename, exclusions, eastasianwidth, unihan,
Antoine Pitroue988e282009-04-27 21:53:26 +0000814 derivednormalizationprops=None, expand=1):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000815 self.changed = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000816 file = open(filename)
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000817 table = [None] * 0x110000
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000818 while 1:
819 s = file.readline()
820 if not s:
821 break
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000822 s = s.strip().split(";")
823 char = int(s[0], 16)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000824 table[char] = s
825
Martin v. Löwis97225da2002-11-24 23:05:09 +0000826 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000827 if expand:
828 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000829 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000830 s = table[i]
831 if s:
832 if s[1][-6:] == "First>":
833 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000834 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000835 elif s[1][-5:] == "Last>":
836 s[1] = ""
837 field = None
838 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000839 f2 = field[:]
840 f2[0] = "%X" % i
841 table[i] = f2
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000842
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000843 # public attributes
844 self.filename = filename
845 self.table = table
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000846 self.chars = range(0x110000) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000847
Martin v. Löwis677bde22002-11-23 22:08:15 +0000848 file = open(exclusions)
849 self.exclusions = {}
850 for s in file:
851 s = s.strip()
852 if not s:
853 continue
854 if s[0] == '#':
855 continue
856 char = int(s.split()[0],16)
857 self.exclusions[char] = 1
858
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000859 widths = [None] * 0x110000
860 for s in open(eastasianwidth):
861 s = s.strip()
862 if not s:
863 continue
864 if s[0] == '#':
865 continue
866 s = s.split()[0].split(';')
867 if '..' in s[0]:
868 first, last = [int(c, 16) for c in s[0].split('..')]
869 chars = range(first, last+1)
870 else:
871 chars = [int(s[0], 16)]
872 for char in chars:
873 widths[char] = s[1]
874 for i in range(0, 0x110000):
875 if table[i] is not None:
876 table[i].append(widths[i])
Antoine Pitroue988e282009-04-27 21:53:26 +0000877 if derivednormalizationprops:
878 quickchecks = [0] * 0x110000 # default is Yes
879 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
880 for s in open(derivednormalizationprops):
881 if '#' in s:
882 s = s[:s.index('#')]
883 s = [i.strip() for i in s.split(';')]
884 if len(s) < 2 or s[1] not in qc_order:
885 continue
886 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
887 quickcheck_shift = qc_order.index(s[1])*2
888 quickcheck <<= quickcheck_shift
889 if '..' not in s[0]:
890 first = last = int(s[0], 16)
891 else:
892 first, last = [int(c, 16) for c in s[0].split('..')]
893 for char in range(first, last+1):
894 assert not (quickchecks[char]>>quickcheck_shift)&3
895 quickchecks[char] |= quickcheck
896 for i in range(0, 0x110000):
897 if table[i] is not None:
898 table[i].append(quickchecks[i])
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000899
Amaury Forgeot d'Arcd0052d12009-10-06 19:56:32 +0000900 for line in open(unihan):
901 if not line.startswith('U+'):
902 continue
903 code, tag, value = line.split(None, 3)[:3]
904 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
905 'kOtherNumeric'):
906 continue
907 value = value.strip().replace(',', '')
908 i = int(code[2:], 16)
909 # Patch the numeric field
910 if table[i] is not None:
911 table[i][8] = value
912
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000913 def uselatin1(self):
914 # restrict character range to ISO Latin 1
915 self.chars = range(256)
916
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000917# hash table tools
918
919# this is a straight-forward reimplementation of Python's built-in
920# dictionary type, using a static data structure, and a custom string
921# hash algorithm.
922
def myhash(s, magic):
    # Case-insensitive string hash used for the static name lookup table.
    # Multiplies the accumulator by `magic` per character and, whenever it
    # grows past 24 bits, folds the overflowing byte back into the low end.
    acc = 0
    for ch in s.upper():
        acc = acc * magic + ord(ch)
        overflow = acc & 0xff000000
        if overflow:
            acc = (acc ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return acc
931
# candidate hash table sizes, as (size, poly) pairs: a power-of-two slot
# count and an accompanying polynomial for the probe sequence.
# Hash.__init__ picks the first size larger than the data set.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
938
class Hash:
    # Static hash table modelled on Python's built-in dict: open addressing
    # with a polynomial-driven probe sequence.  The table is dumped as a C
    # array plus the #defines (magic, size, poly) the C lookup code needs
    # to reproduce the same probing.

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure
        # name: prefix for the generated C identifiers
        # data: list of (string key, value) pairs
        # magic: multiplier passed to myhash (must match the C side)

        # determine table size: first power of two with room for the data,
        # together with its polynomial (see SIZES)
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynominals"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0 # collision counter

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: derive a probe increment from the hash and
            # rehash until a free slot is found
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        # empty slots are stored as 0 in the generated array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the #defines the
        # C lookup code needs
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1002
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001003# stuff to deal with arrays of unsigned integers
1004
class Array:
    # Named sequence of unsigned integers, dumpable as a static C array.

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # Emit the data as a C array declaration, choosing the narrowest
        # unsigned element type and wrapping lines near 78 columns.
        # With trace set, report the table's byte size on stderr.
        width = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", width*len(self.data), "bytes"
        ctype = {1: "unsigned char", 2: "unsigned short"}.get(width, "unsigned int")
        file.write("static " + ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = "    "
            for item in self.data:
                piece = str(item) + ", "
                if len(line) + len(piece) > 78:
                    file.write(line + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1036
def getsize(data):
    """Return the smallest unsigned C integer width, in bytes (1, 2,
    or 4), able to hold every value of the given array."""
    largest = max(data)
    for width, limit in ((1, 256), (2, 65536)):
        if largest < limit:
            return width
    return 4
1046
Tim Peters21013482000-09-25 07:13:41 +00001047def splitbins(t, trace=0):
1048 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1049
1050 t is a sequence of ints. This function can be useful to save space if
1051 many of the ints are the same. t1 and t2 are lists of ints, and shift
1052 is an int, chosen to minimize the combined size of t1 and t2 (in C
1053 code), and where for each i in range(len(t)),
1054 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1055 where mask is a bitmask isolating the last "shift" bits.
1056
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001057 If optional arg trace is non-zero (default zero), progress info
1058 is printed to sys.stderr. The higher the value, the more info
1059 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001060 """
1061
Tim Peters21013482000-09-25 07:13:41 +00001062 if trace:
1063 def dump(t1, t2, shift, bytes):
1064 print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
1065 len(t1), len(t2), shift, bytes)
1066 print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
1067 "bytes"
1068 n = len(t)-1 # last valid index
1069 maxshift = 0 # the most we can shift n and still have something left
1070 if n > 0:
1071 while n >> 1:
1072 n >>= 1
1073 maxshift += 1
1074 del n
1075 bytes = sys.maxint # smallest total size so far
1076 t = tuple(t) # so slices can be dict keys
1077 for shift in range(maxshift + 1):
1078 t1 = []
1079 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001080 size = 2**shift
1081 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001082 for i in range(0, len(t), size):
1083 bin = t[i:i+size]
1084 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001085 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001086 index = len(t2)
1087 bincache[bin] = index
1088 t2.extend(bin)
1089 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001090 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001091 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001092 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001093 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001094 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001095 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001096 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001097 t1, t2, shift = best
1098 if trace:
1099 print >>sys.stderr, "Best:",
1100 dump(t1, t2, shift, bytes)
1101 if __debug__:
1102 # exhaustively verify that the decomposition is correct
1103 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
1104 for i in xrange(len(t)):
1105 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1106 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001107
if __name__ == "__main__":
    # regenerate all database files with tracing enabled
    # (maketables is defined earlier in this file)
    maketables(1)