blob: 3f5ad51969af8ba0136bd4011b8f91cc159d24e1 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
import sys

# Script name and generator version, embedded in the header comment of
# every generated C file.
SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database: version of the UCD this script consumes, plus the
# filename patterns of its data files.  The %s slot takes "" for the
# current version, or "-<version>" for an archived old version.
UNIDATA_VERSION = "5.2.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

# Old UCD versions for which delta ("change record") tables are generated.
old_versions = ["3.2.0"]

# General-category codes; a record's category is stored as an index into
# this list (index 0, "Cn", also stands for "unassigned" in the delta
# tables -- see merge_old_version).
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class codes, likewise stored by index.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East Asian width codes, likewise stored by index.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes treated as mandatory breaks (used when generating
# _PyUnicode_IsLinebreak).
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# Character-type flag bits stored in the type records.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
NUMERIC_MASK = 0x200
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000068def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +000069
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000070 print "--- Reading", UNICODE_DATA % "", "..."
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000071
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000072 version = ""
73 unicode = UnicodeData(UNICODE_DATA % version,
74 COMPOSITION_EXCLUSIONS % version,
Antoine Pitroue988e282009-04-27 21:53:26 +000075 EASTASIAN_WIDTH % version,
Amaury Forgeot d'Arcd0052d12009-10-06 19:56:32 +000076 UNIHAN % version,
Florent Xicluna22b24382010-03-30 08:24:06 +000077 DERIVEDNORMALIZATION_PROPS % version,
78 LINE_BREAK % version)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000079
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000080 print len(filter(None, unicode.table)), "characters"
81
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000082 for version in old_versions:
83 print "--- Reading", UNICODE_DATA % ("-"+version), "..."
84 old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
85 COMPOSITION_EXCLUSIONS % ("-"+version),
Amaury Forgeot d'Arcd0052d12009-10-06 19:56:32 +000086 EASTASIAN_WIDTH % ("-"+version),
87 UNIHAN % ("-"+version))
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088 print len(filter(None, old_unicode.table)), "characters"
89 merge_old_version(version, unicode, old_unicode)
90
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000091 makeunicodename(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000092 makeunicodedata(unicode, trace)
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000093 makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000094
# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h: the deduplicated property-record
    table, decomposition data, NFC composition tables, and the delta
    tables for old Unicode versions.

    unicode is a loaded UnicodeData instance; trace is passed through to
    splitbins()/Array.dump() for verbose output.
    """
    # Record 0 is the dummy used for unassigned code points.
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}          # record tuple -> index in table
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties: build one deduplicated record per distinct
    #    (category, combining, bidir, mirrored, width, quickcheck) tuple.

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]           # flat data array; offset 0 = "no decomposition"
    decomp_prefix = [""]        # compatibility tags ("<compat>", ...); "" = canonical
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []                           # (first, last, composed) NFC triples
    comp_first = [None] * len(unicode.chars)  # marks chars usable as NFC first
    comp_last = [None] * len(unicode.chars)   # marks chars usable as NFC last

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # The C decoder (nfd_nfkd) stores the length in one byte slot.
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix: an optional leading "<tag>" marks a compatibility
                # decomposition; it is replaced by an index into decomp_prefix.
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs (length << 8) | prefix, the rest
                # are the decomposed code points.
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical two-char decompositions whose
                # first char is a starter (combining class 0) and which are
                # not in the composition exclusion table.
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Re-number NFC first/last characters densely and record the
    # contiguous code-point ranges they occupy (for the C reindex tables).
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense (first x last) -> composed-character matrix, split below.
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table (two-level lookup saves space)
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    # split the NFC composition matrix
    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions: for each old UCD version,
    # a deduplicated change_record table plus lookup/normalization helpers.
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()
# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h: the character type records
    (case mappings, digit values, flag bits) and the generated C
    functions _PyUnicode_ToNumeric, _PyUnicode_IsWhitespace and
    _PyUnicode_IsLinebreak.
    """
    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)    # record 0: unassigned code points
    table = [dummy]
    cache = {0: dummy}            # record tuple -> index in table
    index = [0] * len(unicode.chars)
    numeric = {}                  # numeric value (string) -> [code points]
    spaces = []                   # whitespace code points
    linebreaks = []               # linebreak code points

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas (stored as 16-bit two's complement)
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                # deltas don't fit: store absolute code points instead
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                # numeric value kept as its source string (may be "1/4" etc.)
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"
    print sum(map(len, numeric.values())), "numeric code points"
    print len(spaces), "whitespace code points"
    print len(linebreaks), "linebreak code points"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print >>fp, '/* Returns the numeric value as double for Unicode characters'
    print >>fp, ' * having this property, -1.0 otherwise.'
    print >>fp, ' */'
    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        # Code points above 0xFFFF only exist on wide (UCS-4) builds, so
        # their case labels are wrapped in #ifdef Py_UNICODE_WIDE.
        haswide = False
        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
            if codepoint < 0x10000:
                hasnonewide = True
            if codepoint >= 0x10000 and not haswide:
                print >>fp, '#ifdef Py_UNICODE_WIDE'
                haswide = True
            print >>fp, '    case 0x%04X:' % (codepoint,)
        if haswide and hasnonewide:
            print >>fp, '#endif'
        print >>fp, '        return (double) %s;' % (value,)
        if haswide and not hasnonewide:
            print >>fp, '#endif'
    print >>fp,'    }'
    print >>fp,'    return -1.0;'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsWhitespace()
    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
    print >>fp, '    return iswspace(ch);'
    print >>fp, '#else'
    print >>fp, '    switch (ch) {'

    haswide = False
    hasnonewide = False
    for codepoint in sorted(spaces):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp, '#endif'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsLinebreak()
    print >>fp, "/* Returns 1 for Unicode characters having the line break"
    print >>fp, " * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional"
    print >>fp, " * type 'B', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    haswide = False
    hasnonewide = False
    for codepoint in sorted(linebreaks):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp,'}'
    print >>fp

    fp.close()
# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):
    """Write Modules/unicodename_db.h: the compressed code->name
    phrasebook (a word lexicon plus per-character word-index sequences)
    and the name->code hash table.
    """
    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names (skip control/<...> placeholder names); each stored
    # name carries a trailing NUL so "word ending a sentence" is distinct.
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence.  the
    # latter includes the trailing null byte.

    words = {}                  # word -> list whose length is its frequency
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need: words beyond the
    # first `short` get two-byte indexes, introduced by an escape byte.
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}                  # word -> phrasebook index

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes (escape byte + low byte)
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry must fit in one byte
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000723
def merge_old_version(version, new, old):
    """Compute per-character deltas between the *old* and *new* UnicodeData
    tables and record them on *new*.

    Appends to new.changed a tuple
        (version,
         zip(bidir_changes, category_changes, decimal_changes,
             mirrored_changes, numeric_changes),
         normalization_changes)
    which is later emitted as the "change records" used to expose older
    UCD versions (e.g. the 3.2 delta) at runtime.

    The field index k refers to the UnicodeData.txt record layout
    (see the record-structure comment on UnicodeData).
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of (codepoint, old-decomposition) pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    # value is the *old* field value; the change record
                    # lets the runtime reconstruct the old database.
                    value = old.table[i][k]
                    if k == 2:
                        # general category
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        # bidirectional class
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # decomposition mapping
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # decimal digit value
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # numeric value
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                        assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi mirrored flag
                        # NOTE(review): stores the strings '1'/'0' while the
                        # "no change" sentinel is the int 0xFF; presumably the
                        # writer stringifies records so this is harmless --
                        # confirm against the dump code.
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # change to properties; not yet
                        pass
                    else:
                        # any other field difference is unexpected
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000806
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000807
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000808# --------------------------------------------------------------------
809# the following support code is taken from the unidb utilities
810# Copyright (c) 1999-2000 by Secret Labs AB
811
812# load a unicode-data file from disk
813
class UnicodeData:
    """In-memory representation of one version of the Unicode database.

    table[codepoint] is a list of string fields (or None for unassigned
    codepoints).  Fields 0..14 come straight from UnicodeData.txt; this
    constructor appends the east-asian width (index 15), a set of extra
    binary properties (index 16; only 'Line_Break' is added here), and --
    when derivednormalizationprops is given -- a quick-check bitmask
    (index 17).
    """
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  properties] (17)

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivednormalizationprops=None, linebreakprops=None,
                 expand=1):
        """Load the database from the given UCD data files.

        filename, exclusions, eastasianwidth, unihan (and the optional
        derivednormalizationprops, linebreakprops) are file paths --
        presumably UnicodeData.txt, CompositionExclusions.txt,
        EastAsianWidth.txt, a Unihan file, DerivedNormalizationProps.txt
        and LineBreak.txt respectively (TODO confirm against caller).
        If expand is true, <...First>/<...Last> range sentinels are
        expanded into one record per codepoint.

        NOTE(review): file handles opened here are never explicitly
        closed; tolerable for a one-shot generator script.
        """
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        # start of a range: blank the name and remember
                        # the record to copy for the following codepoints
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                    elif field:
                        # inside a range: clone the First record, fixing
                        # up the codepoint field
                        f2 = field[:]
                        f2[0] = "%X" % i
                        table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # full codepoint range (unicode 3.2+)

        # composition exclusions: set of codepoints (stored as a dict)
        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        # east asian width: append as field 15 of each assigned record
        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # extra binary properties: append a (possibly empty) set as
        # field 16 of each assigned record
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        if linebreakprops:
            for s in open(linebreakprops):
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                # MANDATORY_LINE_BREAKS is a module-level constant
                # defined elsewhere in this file
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        if derivednormalizationprops:
            # quickchecks[c] packs 2 bits per normalization form, in the
            # order NFD, NFKD, NFC, NFKC: 0 = Yes, 1 = Maybe, 2 = No.
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            for s in open(derivednormalizationprops):
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    # each codepoint/form pair must appear at most once
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
            # append the packed mask as field 17
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        # Unihan numeric values: patch the numeric field (index 8) for
        # CJK characters carrying a kAccountingNumeric / kPrimaryNumeric /
        # kOtherNumeric tag; thousands separators are stripped.
        for line in open(unihan):
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
942
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000943# hash table tools
944
945# this is a straight-forward reimplementation of Python's built-in
946# dictionary type, using a static data structure, and a custom string
947# hash algorithm.
948
def myhash(s, magic):
    """Case-insensitive 24-bit multiplicative string hash.

    Used to build the static name->code hash table; any bits that
    spill past bit 23 are folded back into the low byte so the result
    always fits in 24 bits.
    """
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        overflow = h & 0xff000000
        if overflow:
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
957
# Candidate (table size, poly) pairs for the static hash table: sizes
# are powers of two; poly parameterizes the collision probe sequence in
# Hash.__init__ (presumably chosen so the probe visits every slot --
# the comment history says they minimize collisions; TODO confirm).
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
964
class Hash:
    """A statically-built open-addressing hash table.

    Reimplements dict lookup as a flat array that can be dumped as C
    data: keys are hashed with myhash(key, magic); collisions probe
    with an increment that is repeatedly doubled and folded with the
    table's poly, so the C-side lookup can reproduce the same probe
    order from (magic, size, poly) alone.
    """
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: smallest power of two larger than the
        # data set; poly is offset by size so overflow folding below
        # keeps incr inside the table
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynominals"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0  # total number of probe steps (collisions)

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: derive a probe increment from the hash
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                # vary the increment: double it, folding overflow back
                # with poly (LFSR-style; the C lookup mirrors this)
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        # empty slots become 0 so the table can be dumped as ints
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array plus the #defines the C
        # lookup code needs to re-run the hash/probe sequence
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1028
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001029# stuff to deal with arrays of unsigned integers
1030
class Array:
    """A named sequence of unsigned integers that can be emitted as a
    static C array declaration."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as C source, using the narrowest
        unsigned element type that holds every value (per getsize)."""
        size = getsize(self.data)
        if trace:
            sys.stderr.write("%s: %d bytes\n" % (self.name, size*len(self.data)))
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static %s %s[] = {\n" % (ctype, self.name))
        if self.data:
            line = "    "
            for item in self.data:
                text = "%s, " % item
                # wrap output lines at roughly 78 columns
                if len(line) + len(text) > 78:
                    file.write(line + "\n")
                    line = "    " + text
                else:
                    line = line + text
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1062
def getsize(data):
    """Return the smallest C integer width in bytes (1, 2 or 4) that
    can represent every value in *data*."""
    biggest = max(data)
    if biggest < 256:
        return 1
    if biggest < 65536:
        return 2
    return 4
1072
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    # try every bin width 2**shift; for each, deduplicate the bins into
    # t2 (via bincache) and index them from t1, keeping the cheapest split
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                # first occurrence of this bin: append it to t2
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    # the loop runs at least once (maxshift >= 0), so best is always bound
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001133
# Command-line entry point: rebuild all database headers with tracing
# enabled (maketables is defined earlier in this file).
if __name__ == "__main__":
    maketables(1)