blob: 439a45b57e63e0a07aade3188d0de0edc87e390d [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts the Unicode database files to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
27
28import sys
29
# Generator identification; both values are written into the banner
# comment at the top of every generated header.
SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.1.0"

# File name templates.  The "%s" slot receives a version suffix: the
# empty string for the current data, "-<version>" for an old release.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

# Older database versions for which delta tables are generated.
old_versions = ["3.2.0"]

# The position of a name in each of the following lists is the numeric
# code stored in the generated tables, so the order (including the
# repeated "Cn" entry) must not be changed.
CATEGORY_NAMES = [
    "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs",
    "Co", "Cn", "Lm", "Lo", "Pc", "Pd", "Ps", "Pe",
    "Pi", "Pf", "Po", "Sm", "Sc", "Sk", "So",
]

BIDIRECTIONAL_NAMES = [
    "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO", "PDF", "EN",
    "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS", "ON",
]

EASTASIANWIDTH_NAMES = ["F", "H", "W", "Na", "A", "N"]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 1 << 0
DECIMAL_MASK = 1 << 1
DIGIT_MASK = 1 << 2
LOWER_MASK = 1 << 3
LINEBREAK_MASK = 1 << 4
SPACE_MASK = 1 << 5
TITLE_MASK = 1 << 6
UPPER_MASK = 1 << 7
XID_START_MASK = 1 << 8
XID_CONTINUE_MASK = 1 << 9
PRINTABLE_MASK = 1 << 10
NODELTA_MASK = 1 << 11
NUMERIC_MASK = 1 << 12
Fredrik Lundhe9133f72000-09-25 17:59:57 +000069
def maketables(trace=0):
    """Load the Unicode data files and regenerate every output table.

    The current database is read first; each release listed in
    ``old_versions`` is then read and folded into it through
    ``merge_old_version``.  Finally the name, property and type tables
    are generated, in that order.

    *trace* is passed through unchanged to the three generators.
    """
    current_suffix = ""
    print("--- Reading", UNICODE_DATA % current_suffix, "...")

    unicode = UnicodeData(
        UNICODE_DATA % current_suffix,
        COMPOSITION_EXCLUSIONS % current_suffix,
        EASTASIAN_WIDTH % current_suffix,
        UNIHAN % current_suffix,
        DERIVED_CORE_PROPERTIES % current_suffix,
        DERIVEDNORMALIZATION_PROPS % current_suffix)

    print(len([record for record in unicode.table if record]), "characters")

    for version in old_versions:
        # Files belonging to an old release carry a "-<version>" suffix.
        suffix = "-" + version
        print("--- Reading", UNICODE_DATA % suffix, "...")
        # No normalization-properties file is passed for old releases.
        old_unicode = UnicodeData(
            UNICODE_DATA % suffix,
            COMPOSITION_EXCLUSIONS % suffix,
            EASTASIAN_WIDTH % suffix,
            UNIHAN % suffix,
            DERIVED_CORE_PROPERTIES % suffix)
        print(len([record for record in old_unicode.table if record]),
              "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000097
# --------------------------------------------------------------------
# unicode character properties
100
def _renumber_marked(chars, marks):
    """Give every marked code point a dense index and collect its ranges.

    *marks* is indexed by code point; each entry that is not None is
    overwritten in place with a running counter (0, 1, 2, ...) in the
    order the code points appear in *chars*.

    Returns ``(ranges, total)`` where *ranges* is a list of inclusive
    ``(start, end)`` tuples covering the runs of consecutive marked code
    points and *total* is the number of marked code points.  With no
    marked code point at all, *ranges* is empty.
    """
    ranges = []
    total = 0
    current = None
    for code in chars:
        if marks[code] is None:
            continue
        marks[code] = total
        total += 1
        if current is None:
            current = (code, code)
        elif current[1] + 1 == code:
            # extends the run that is currently open
            current = (current[0], code)
        else:
            ranges.append(current)
            current = (code, code)
    if current is not None:
        ranges.append(current)
    return ranges, total


def _write_string_table(fp, declaration, names):
    """Write *names* to *fp* as a NULL-terminated C array of strings."""
    print(declaration, file=fp)
    for name in names:
        print(" \"%s\"," % name, file=fp)
    print(" NULL", file=fp)
    print("};", file=fp)


def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h from the loaded database.

    *unicode* must provide ``chars`` (the code points to process),
    ``table`` (one record per code point, false for missing entries),
    ``exclusions`` (code points excluded from composition) and
    ``changed`` (``(version, table, normalization)`` triples for the old
    database versions).  *trace* is forwarded to ``splitbins`` and
    ``Array.dump``.

    The generated header contains the unique property records and their
    index tables, the decomposition data and its index tables, the NFC
    composition tables, and the delta tables for old versions.

    Raises Exception if a decomposition has more than 19 items.
    """
    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    # Maps record -> position in table.  Record 0 is the dummy, so a
    # character whose properties equal the dummy reuses it.
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        # extract database properties
        item = (
            CATEGORY_NAMES.index(record[2]),         # category
            int(record[3]),                          # combining
            BIDIRECTIONAL_NAMES.index(record[4]),    # bidirectional
            record[9] == "Y",                        # mirrored
            EASTASIANWIDTH_NAMES.index(record[15]),  # east asian width
            record[17],                              # normalization quick check
        )
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0
    # Maps an encoded decomposition to its offset in decomp_data so that
    # identical decompositions are stored only once.
    decomp_cache = {}

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if not record or not record[5]:
            # no decomposition: decomp_index[char] keeps its initial 0
            continue
        decomp = record[5].split()
        if len(decomp) > 19:
            raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
        # prefix
        if decomp[0][0] == "<":
            prefix = decomp.pop(0)
        else:
            prefix = ""
        try:
            prefix = decomp_prefix.index(prefix)
        except ValueError:
            decomp_prefix.append(prefix)
            prefix = len(decomp_prefix) - 1
        # the prefix code shares a word with the length (see below)
        assert prefix < 256
        # content: header word (length in the high bits, prefix code in
        # the low byte) followed by the code points
        decomp = [prefix + (len(decomp) << 8)] + [int(s, 16) for s in decomp]
        # Collect NFC pairs
        if not prefix and len(decomp) == 3 and \
           char not in unicode.exclusions and \
           unicode.table[decomp[1]][3] == "0":
            first, last = decomp[1], decomp[2]
            comp_first[first] = 1
            comp_last[last] = 1
            comp_pairs.append((first, last, char))
        key = tuple(decomp)
        offset = decomp_cache.get(key)
        if offset is None:
            offset = decomp_cache[key] = len(decomp_data)
            decomp_data.extend(decomp)
            decomp_size = decomp_size + len(decomp) * 2
        decomp_index[char] = offset

    # Replace the markers in comp_first/comp_last by dense indexes.
    comp_first_ranges, total_first = _renumber_marked(unicode.chars, comp_first)
    comp_last_ranges, total_last = _renumber_marked(unicode.chars, comp_last)

    # total_first x total_last matrix, stored row by row
    comp_data = [0] * (total_first * total_last)
    for first, last, char in comp_pairs:
        comp_data[comp_first[first] * total_last + comp_last[last]] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    # The with block closes the file even if generation fails part-way.
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
        print("/* a list of unique database records */", file=fp)
        print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
        for item in table:
            print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
        print("};", file=fp)
        print(file=fp)

        print("/* Reindexing of NFC first characters. */", file=fp)
        print("#define TOTAL_FIRST", total_first, file=fp)
        print("#define TOTAL_LAST", total_last, file=fp)
        print("struct reindex{int start;short count,index;};", file=fp)
        print("static struct reindex nfc_first[] = {", file=fp)
        for start, end in comp_first_ranges:
            print(" { %d, %d, %d}," % (start, end - start, comp_first[start]), file=fp)
        print(" {0,0,0}", file=fp)
        print("};\n", file=fp)
        print("static struct reindex nfc_last[] = {", file=fp)
        for start, end in comp_last_ranges:
            print(" { %d, %d, %d}," % (start, end - start, comp_last[start]), file=fp)
        print(" {0,0,0}", file=fp)
        print("};\n", file=fp)

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        print("/* string literals */", file=fp)
        _write_string_table(
            fp, "const char *_PyUnicode_CategoryNames[] = {", CATEGORY_NAMES)
        _write_string_table(
            fp, "const char *_PyUnicode_BidirectionalNames[] = {",
            BIDIRECTIONAL_NAMES)
        _write_string_table(
            fp, "const char *_PyUnicode_EastAsianWidthNames[] = {",
            EASTASIANWIDTH_NAMES)
        _write_string_table(
            fp, "static const char *decomp_prefix[] = {", decomp_prefix)

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        print("/* index tables for the database records */", file=fp)
        print("#define SHIFT", shift, file=fp)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        print("/* decomposition data */", file=fp)
        Array("decomp_data", decomp_data).dump(fp, trace)

        print("/* index tables for the decomposition data */", file=fp)
        print("#define DECOMP_SHIFT", shift, file=fp)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        # split NFC composition matrix
        index1, index2, shift = splitbins(comp_data, trace)
        print("/* NFC pairs */", file=fp)
        print("#define COMP_SHIFT", shift, file=fp)
        Array("comp_index", index1).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, changes, normalization in unicode.changed:
            cversion = version.replace(".", "_")
            records = [changes[0]]
            change_cache = {changes[0]: 0}
            change_index = [0] * len(changes)
            for i, record in enumerate(changes):
                pos = change_cache.get(record)
                if pos is None:
                    pos = change_cache[record] = len(records)
                    records.append(record)
                change_index[i] = pos
            index1, index2, shift = splitbins(change_index, trace)
            print("static const change_record change_records_%s[] = {" % cversion, file=fp)
            for record in records:
                print("\t{ %s }," % ", ".join(map(str, record)), file=fp)
            print("};", file=fp)
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
            print("{", file=fp)
            print("\tint index;", file=fp)
            print("\tif (n >= 0x110000) index = 0;", file=fp)
            print("\telse {", file=fp)
            print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
            print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" %
                  (cversion, shift, ((1 << shift) - 1)), file=fp)
            print("\t}", file=fp)
            print("\treturn change_records_%s+index;" % cversion, file=fp)
            print("}\n", file=fp)
            print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
            print("{", file=fp)
            print("\tswitch(n) {", file=fp)
            for k, v in normalization:
                print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
            print("\tdefault: return 0;", file=fp)
            print("\t}\n}\n", file=fp)
349
# --------------------------------------------------------------------
# unicode character type tables
352
def _write_switch_cases(fp, codepoints, return_statement):
    """Write C ``case`` labels for *codepoints*, then *return_statement*.

    The labels are written in ascending order.  Labels for code points
    of 0x10000 and above are wrapped in ``#ifdef Py_UNICODE_WIDE``.  When
    the group also has labels below 0x10000 the ``#endif`` is written
    before the return statement, so the statement stays available to
    them; otherwise the whole group, return statement included, is
    conditional.
    """
    haswide = False
    hasnonewide = False
    for codepoint in sorted(codepoints):
        if codepoint < 0x10000:
            hasnonewide = True
        elif not haswide:
            print('#ifdef Py_UNICODE_WIDE', file=fp)
            haswide = True
        print(' case 0x%04X:' % (codepoint,), file=fp)
    if haswide and hasnonewide:
        print('#endif', file=fp)
    print(return_statement, file=fp)
    if haswide and not hasnonewide:
        print('#endif', file=fp)


def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h from the loaded database.

    *unicode* must provide ``chars`` (the code points to process) and
    ``table`` (one record per code point, false for missing entries).
    *trace* is forwarded to ``splitbins`` and ``Array.dump``.

    The generated header contains the unique type records
    ``(upper, lower, title, decimal, digit, flags)``, their index
    tables, and the C functions ``_PyUnicode_ToNumeric``,
    ``_PyUnicode_IsWhitespace`` and ``_PyUnicode_IsLinebreak``.
    """
    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    # Maps record -> position in table.  Record 0 is the dummy, so a
    # character whose type record is all zeros reuses it instead of
    # adding a duplicate.
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)
    numeric = {}        # numeric value string -> list of code points
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        # extract database properties
        category = record[2]
        bidirectional = record[4]
        properties = record[16]
        flags = 0
        if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
            flags |= ALPHA_MASK
        if category == "Ll":
            flags |= LOWER_MASK
        if category == "Zl" or bidirectional == "B":
            flags |= LINEBREAK_MASK
            linebreaks.append(char)
        if category == "Zs" or bidirectional in ("WS", "B", "S"):
            flags |= SPACE_MASK
            spaces.append(char)
        if category == "Lt":
            flags |= TITLE_MASK
        if category == "Lu":
            flags |= UPPER_MASK
        if char == ord(" ") or category[0] not in ("C", "Z"):
            flags |= PRINTABLE_MASK
        if "XID_Start" in properties:
            flags |= XID_START_MASK
        if "XID_Continue" in properties:
            flags |= XID_CONTINUE_MASK
        # use delta predictor for upper/lower/title if it fits
        upper = int(record[12], 16) if record[12] else char
        lower = int(record[13], 16) if record[13] else char
        if record[14]:
            title = int(record[14], 16)
        else:
            # UCD.html says that a missing title char means that
            # it defaults to the uppercase character, not to the
            # character itself. Apparently, in the current UCD (5.x)
            # this feature is never used
            title = upper
        upper_d = upper - char
        lower_d = lower - char
        title_d = title - char
        if -32768 <= upper_d <= 32767 and \
           -32768 <= lower_d <= 32767 and \
           -32768 <= title_d <= 32767:
            # use deltas, stored as 16-bit two's complement values
            upper = upper_d & 0xffff
            lower = lower_d & 0xffff
            title = title_d & 0xffff
        else:
            # a delta does not fit: keep the absolute code points
            flags |= NODELTA_MASK
        # decimal digit, integer digit
        decimal = 0
        if record[6]:
            flags |= DECIMAL_MASK
            decimal = int(record[6])
        digit = 0
        if record[7]:
            flags |= DIGIT_MASK
            digit = int(record[7])
        if record[8]:
            flags |= NUMERIC_MASK
            numeric.setdefault(record[8], []).append(char)
        item = (upper, lower, title, decimal, digit, flags)
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")

    print("--- Writing", FILE, "...")

    # The with block closes the file even if generation fails part-way.
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print("/* a list of unique character type descriptors */", file=fp)
        print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
        for item in table:
            print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
        print("};", file=fp)
        print(file=fp)

        # split type record index table
        index1, index2, shift = splitbins(index, trace)

        print("/* type indexes */", file=fp)
        print("#define SHIFT", shift, file=fp)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        print('/* Returns the numeric value as double for Unicode characters', file=fp)
        print(' * having this property, -1.0 otherwise.', file=fp)
        print(' */', file=fp)
        print('double _PyUnicode_ToNumeric(Py_UNICODE ch)', file=fp)
        print('{', file=fp)
        print(' switch (ch) {', file=fp)
        for value, codepoints in sorted(numeric.items()):
            _write_switch_cases(fp, codepoints,
                                ' return (double) %s;' % (value,))
        print(' }', file=fp)
        print(' return -1.0;', file=fp)
        print('}', file=fp)
        print(file=fp)

        # Generate code for _PyUnicode_IsWhitespace()
        print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
        print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
        print(" */", file=fp)
        print('int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)', file=fp)
        print('{', file=fp)
        print('#ifdef WANT_WCTYPE_FUNCTIONS', file=fp)
        print(' return iswspace(ch);', file=fp)
        print('#else', file=fp)
        print(' switch (ch) {', file=fp)
        _write_switch_cases(fp, spaces, ' return 1;')
        print(' }', file=fp)
        print(' return 0;', file=fp)
        print('#endif', file=fp)
        print('}', file=fp)
        print(file=fp)

        # Generate code for _PyUnicode_IsLinebreak()
        print("/* Returns 1 for Unicode characters having the category 'Zl',", file=fp)
        print(" * 'Zp' or type 'B', 0 otherwise.", file=fp)
        print(" */", file=fp)
        print('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)', file=fp)
        print('{', file=fp)
        print(' switch (ch) {', file=fp)
        _write_switch_cases(fp, linebreaks, ' return 1;')
        print(' }', file=fp)
        print(' return 0;', file=fp)
        print('}', file=fp)
        print(file=fp)
564
565# --------------------------------------------------------------------
566# unicode name database
567
568def makeunicodename(unicode, trace):
569
570 FILE = "Modules/unicodename_db.h"
571
Collin Winter6afaeb72007-08-03 17:06:41 +0000572 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000573
574 # collect names
575 names = [None] * len(unicode.chars)
576
577 for char in unicode.chars:
578 record = unicode.table[char]
579 if record:
580 name = record[1].strip()
581 if name and name[0] != "<":
582 names[char] = name + chr(0)
583
Georg Brandl559e5d72008-06-11 18:37:52 +0000584 print(len(list(n for n in names if n is not None)), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000585
586 # collect unique words from names (note that we differ between
587 # words inside a sentence, and words ending a sentence. the
588 # latter includes the trailing null byte.
589
590 words = {}
591 n = b = 0
592 for char in unicode.chars:
593 name = names[char]
594 if name:
595 w = name.split()
596 b = b + len(name)
597 n = n + len(w)
598 for w in w:
599 l = words.get(w)
600 if l:
601 l.append(None)
602 else:
603 words[w] = [len(words)]
604
Collin Winter6afaeb72007-08-03 17:06:41 +0000605 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000606
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000607 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000608
Martin v. Löwis97225da2002-11-24 23:05:09 +0000609 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000610 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000611 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000612 return -len(alist), aword
613 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000614
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000615 # figure out how many phrasebook escapes we need
616 escapes = 0
617 while escapes * 256 < len(wordlist):
618 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000619 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000620
621 short = 256 - escapes
622
623 assert short > 0
624
Collin Winter6afaeb72007-08-03 17:06:41 +0000625 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000626
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000627 # statistics
628 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000629 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000630 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000631 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000632
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000633 # pick the most commonly used words, and sort the rest on falling
634 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000635
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000636 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000637 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000638 wordlist.extend(wordtail)
639
640 # generate lexicon from words
641
642 lexicon_offset = [0]
643 lexicon = ""
644 words = {}
645
646 # build a lexicon string
647 offset = 0
648 for w, x in wordlist:
649 # encoding: bit 7 indicates last character in word (chr(128)
650 # indicates the last character in an entire string)
651 ww = w[:-1] + chr(ord(w[-1])+128)
652 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000653 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000654 if o < 0:
655 o = offset
656 lexicon = lexicon + ww
657 offset = offset + len(w)
658 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000659 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000660
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000661 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000662
663 # generate phrasebook from names and lexicon
664 phrasebook = [0]
665 phrasebook_offset = [0] * len(unicode.chars)
666 for char in unicode.chars:
667 name = names[char]
668 if name:
669 w = name.split()
670 phrasebook_offset[char] = len(phrasebook)
671 for w in w:
672 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000673 if i < short:
674 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000675 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000676 # store as two bytes
677 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000678 phrasebook.append(i&255)
679
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000680 assert getsize(phrasebook) == 1
681
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000682 #
683 # unicode name hash table
684
685 # extract names
686 data = []
687 for char in unicode.chars:
688 record = unicode.table[char]
689 if record:
690 name = record[1].strip()
691 if name and name[0] != "<":
692 data.append((name, char))
693
694 # the magic number 47 was chosen to minimize the number of
695 # collisions on the current data set. if you like, change it
696 # and see what happens...
697
698 codehash = Hash("code", data, 47)
699
Collin Winter6afaeb72007-08-03 17:06:41 +0000700 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000701
702 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000703 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
704 print(file=fp)
705 print("#define NAME_MAXLEN", 256, file=fp)
706 print(file=fp)
707 print("/* lexicon */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000708 Array("lexicon", lexicon).dump(fp, trace)
709 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000710
711 # split decomposition index table
712 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
713
Collin Winter6afaeb72007-08-03 17:06:41 +0000714 print("/* code->name phrasebook */", file=fp)
715 print("#define phrasebook_shift", shift, file=fp)
716 print("#define phrasebook_short", short, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000717
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000718 Array("phrasebook", phrasebook).dump(fp, trace)
719 Array("phrasebook_offset1", offset1).dump(fp, trace)
720 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000721
Collin Winter6afaeb72007-08-03 17:06:41 +0000722 print("/* name->code dictionary */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000723 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000724
725 fp.close()
726
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000727
def merge_old_version(version, new, old):
    """Record on ``new`` how the ``old`` database differs from it.

    Appends one ``(version, changes, normalization_changes)`` tuple to
    ``new.changed``.  ``changes`` is a list with one
    ``(bidir, category, decimal, mirrored, numeric)`` tuple per code point
    in ``range(0x110000)``; ``normalization_changes`` is a list of
    ``(code point, old decomposition)`` pairs.

    Raises NotImplementedError when the exclusion tables of the two
    databases differ, and a locally defined ``Difference`` exception when
    a field differs that this function does not know how to encode.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    class Difference(Exception):
        pass

    # Record fields whose changes are deliberately ignored: ISO comment
    # (11), simple upper/lower/titlecase mappings (12-14) and derived
    # properties (16, not handled yet).
    ignored_fields = (11, 12, 13, 14, 16)

    npoints = 0x110000
    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF] * npoints
    category_changes = [0xFF] * npoints
    decimal_changes = [0xFF] * npoints
    mirrored_changes = [0xFF] * npoints
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * npoints
    # normalization_changes is a list of key-value pairs
    normalization_changes = []

    for i in range(npoints):
        new_record = new.table[i]
        if new_record is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        old_record = old.table[i]
        if old_record is None:
            # unassigned in the old version; category 0 is "unassigned"
            category_changes[i] = 0
            continue
        if old_record != new_record:
            # walk the fields and encode every one that differs
            for k, value in enumerate(old_record):
                if value != new_record[k]:
                    if k == 2:
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # We assume that all normalization changes are
                        # in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # we only support changes where the old value
                        # is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # Since 0 encodes "no change", the old value is
                        # better not 0
                        if value:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                        else:
                            numeric_changes[i] = -1
                    elif k == 9:
                        mirrored_changes[i] = '1' if value == 'Y' else '0'
                    elif k in ignored_fields:
                        pass
                    else:
                        raise Difference(hex(i), k, old_record, new_record)

    per_char_changes = list(zip(bidir_changes, category_changes,
                                decimal_changes, mirrored_changes,
                                numeric_changes))
    new.changed.append((version, per_char_changes, normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000810
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000811
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000812# --------------------------------------------------------------------
813# the following support code is taken from the unidb utilities
814# Copyright (c) 1999-2000 by Secret Labs AB
815
816# load a unicode-data file from disk
817
class UnicodeData:
    """Contents of the Unicode character database files, loaded into memory.

    Attributes set by the constructor:

    filename    path of the main data file that was read
    table       list of 0x110000 entries indexed by code point; each entry
                is None or a list of fields (see "Record structure")
    chars       list of code points to process (all of them until
                uselatin1() is called)
    exclusions  dict mapping every code point listed in the exclusions
                file to 1
    changed     list, initially empty
    """

    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)
    # When derivednormalizationprops is given, the packed normalization
    # quick-check flags are appended as one more field.

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivedprops, derivednormalizationprops=None, expand=1):
        """Load and combine the database files.

        filename, exclusions, eastasianwidth, unihan and derivedprops are
        paths of the files to read.  derivednormalizationprops is an
        optional path; when it is false no quick-check field is appended
        to the records.  If expand is true, code points lying between a
        "..., First>" and a "..., Last>" record get a copy of the First
        record (with an empty name and their own ID).

        Every file is closed as soon as it has been read.
        """
        self.changed = []
        table = [None] * 0x110000
        with open(filename) as fp:
            for line in fp:
                fields = line.strip().split(";")
                table[int(fields[0], 16)] = fields

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1].endswith("First>"):
                        s[1] = ""
                        field = s
                    elif s[1].endswith("Last>"):
                        s[1] = ""
                        field = None
                elif field:
                    # inside a range: clone the First record
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        self.exclusions = {}
        with open(exclusions) as fp:
            for line in fp:
                line = line.strip()
                if not line or line[0] == '#':
                    continue
                char = int(line.split()[0], 16)
                self.exclusions[char] = 1

        widths = [None] * 0x110000
        with open(eastasianwidth) as fp:
            for line in fp:
                line = line.strip()
                if not line or line[0] == '#':
                    continue
                fields = line.split()[0].split(';')
                for char in self._codepoints(fields[0]):
                    widths[char] = fields[1]

        # append the ea-width field, then an (initially empty) set that
        # collects the derived properties of the character
        for i, record in enumerate(table):
            if record is not None:
                record.append(widths[i])
                record.append(set())

        with open(derivedprops) as fp:
            for line in fp:
                line = line.split('#', 1)[0].strip()
                if not line:
                    continue
                r, p = line.split(";")
                p = p.strip()
                for char in self._codepoints(r.strip()):
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            with open(derivednormalizationprops) as fp:
                for line in fp:
                    line = line.split('#', 1)[0]
                    s = [i.strip() for i in line.split(';')]
                    if len(s) < 2 or s[1] not in qc_order:
                        continue
                    quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                    # each normalization form owns two bits of the field
                    quickcheck_shift = qc_order.index(s[1])*2
                    quickcheck <<= quickcheck_shift
                    for char in self._codepoints(s[0]):
                        assert not (quickchecks[char]>>quickcheck_shift)&3
                        quickchecks[char] |= quickcheck
            for i, record in enumerate(table):
                if record is not None:
                    record.append(quickchecks[i])

        with open(unihan, encoding='utf-8') as fp:
            for line in fp:
                if not line.startswith('U+'):
                    continue
                code, tag, value = line.split(None, 3)[:3]
                if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                               'kOtherNumeric'):
                    continue
                value = value.strip().replace(',', '')
                i = int(code[2:], 16)
                # Patch the numeric field
                if table[i] is not None:
                    table[i][8] = value

    @staticmethod
    def _codepoints(spec):
        """Return the code points named by "XXXX" or "XXXX..YYYY" (hex)."""
        if '..' in spec:
            first, last = [int(c, 16) for c in spec.split('..')]
        else:
            first = last = int(spec, 16)
        return range(first, last+1)

    def uselatin1(self):
        """Restrict the character range to ISO Latin 1."""
        self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000951
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000952# hash table tools
953
954# this is a straight-forward reimplementation of Python's built-in
955# dictionary type, using a static data structure, and a custom string
956# hash algorithm.
957
def myhash(s, magic):
    """Return the case-insensitive string hash of ``s``, seeded by ``magic``.

    The string is upper-cased, then folded character by character; after
    every step anything in bits 24-31 is XORed back into the low byte and
    the value is cut down to 24 bits.
    """
    acc = 0
    for ch in s.upper():
        acc = acc * magic + ord(ch)
        high = acc & 0xff000000
        if high:
            acc = (acc ^ ((high >> 24) & 0xff)) & 0x00ffffff
    return acc
966
# (table size, poly) pairs, in increasing size order.  Hash picks the
# first entry whose size exceeds the number of items and uses
# size + poly to fold the probe increment back into range.
SIZES = [
    (4, 3),
    (8, 3),
    (16, 3),
    (32, 5),
    (64, 3),
    (128, 3),
    (256, 29),
    (512, 17),
    (1024, 9),
    (2048, 5),
    (4096, 83),
    (8192, 27),
    (16384, 43),
    (32768, 3),
    (65536, 45),
    (131072, 9),
    (262144, 39),
    (524288, 39),
    (1048576, 9),
    (2097152, 5),
    (4194304, 3),
    (8388608, 33),
    (16777216, 27),
]
973
class Hash:
    """A (key, value) list turned into a static hash table structure."""

    def __init__(self, name, data, magic):
        """Build the table for ``data``, a list of (key, value) pairs.

        Keys are hashed with myhash(key, magic).  The table size is the
        first size in SIZES that is larger than len(data); slots left
        unused are filled with 0.  Prints the slot and collision counts.

        Raises AssertionError if no entry of SIZES is large enough.
        """
        # determine table size
        fit = next((entry for entry in SIZES if entry[0] > len(data)), None)
        if fit is None:
            raise AssertionError("ran out of polynominals")
        size, poly = fit
        poly += size

        print(size, "slots in hash table")

        slots = [None] * size
        mask = size - 1
        collisions = 0

        # initialize hash table
        for key, value in data:
            h = myhash(key, magic)
            slot = ~h & mask
            if slots[slot] is not None:
                # home slot taken: probe with a shifting increment until
                # a free slot turns up
                step = ((h ^ (h >> 3)) & mask) or mask
                while True:
                    collisions += 1
                    slot = (slot + step) & mask
                    if slots[slot] is None:
                        break
                    step <<= 1
                    if step > mask:
                        step ^= poly
            slots[slot] = value

        print(collisions, "collisions")
        self.collisions = collisions

        # unused slots are written out as 0
        filled = [0 if entry is None else entry for entry in slots]

        self.data = Array(name + "_hash", filled)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        """Write the table to ``file`` as a C array plus three #defines."""
        self.data.dump(file, trace)
        prefix = self.name
        file.write("#define %s_magic %d\n" % (prefix, self.magic))
        file.write("#define %s_size %d\n" % (prefix, self.size))
        file.write("#define %s_poly %d\n" % (prefix, self.poly))
1037
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001038# stuff to deal with arrays of unsigned integers
1039
class Array:
    """A named sequence of unsigned integers that can be dumped as C code."""

    def __init__(self, name, data):
        self.name, self.data = name, data

    def dump(self, file, trace=0):
        """Write the data to ``file`` as a static C array definition.

        The element type is picked from getsize(self.data).  If trace is
        true, the array name and its size in bytes go to sys.stderr.
        """
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        ctype = {1: "unsigned char", 2: "unsigned short"}.get(
            size, "unsigned int")
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            # pack as many items as fit on a line of at most 78 characters
            line = " "
            for item in self.data:
                text = str(item) + ", "
                if len(line) + len(text) > 78:
                    file.write(line + "\n")
                    line = " " + text
                else:
                    line += text
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1071
def getsize(data):
    """Return the smallest integer size (1, 2 or 4 bytes) for ``data``.

    The size is chosen from the largest item: 1 if it is below 256, 2 if
    it is below 65536, otherwise 4.  An empty sequence needs no storage
    per item and reports the smallest size, 1, instead of failing in
    max().
    """
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
1081
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def report(index, bins, shift, nbytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(index), len(bins), shift, nbytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t),
              "bytes", file=sys.stderr)

    # the most we can shift the last valid index and still have
    # something left
    maxshift = max(len(t) - 1, 1).bit_length() - 1

    smallest = sys.maxsize  # smallest total size so far
    t = tuple(t)            # so slices can be dict keys
    for shift in range(maxshift + 1):
        index_table = []
        bin_table = []
        width = 1 << shift
        seen = {}
        for start in range(0, len(t), width):
            chunk = t[start:start + width]
            where = seen.get(chunk)
            if where is None:
                # first occurrence of this bin: store it at the end
                where = len(bin_table)
                seen[chunk] = where
                bin_table.extend(chunk)
            index_table.append(where >> shift)
        # determine memory size
        total = (len(index_table)*getsize(index_table)
                 + len(bin_table)*getsize(bin_table))
        if trace > 1:
            report(index_table, bin_table, shift, total)
        if total < smallest:
            best = index_table, bin_table, shift
            smallest = total

    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        report(t1, t2, shift, smallest)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = (1 << shift) - 1  # i.e., low-bit mask of shift bits
        for i, expected in enumerate(t):
            assert expected == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001142
# Script entry point: regenerate the tables when run directly.
# NOTE(review): the argument 1 is presumably a trace level -- confirm
# against the definition of maketables.
if __name__ == "__main__":
    maketables(1)