blob: 52cb3656e0c43f238ff52e911ffd170cdd95cf7e [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00004# this script converts a unicode 3.2 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000022# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Georg Brandld52429f2008-07-04 15:55:02 +000023# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
Fredrik Lundhcfcea492000-09-25 08:07:06 +000024#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000025# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000026#
27
28import sys
29
30SCRIPT = sys.argv[0]
Martin v. Löwis93cbca32008-09-10 14:08:48 +000031VERSION = "2.6"
Fredrik Lundhf367cac2000-09-24 23:18:31 +000032
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000033# The Unicode Database
Martin v. Löwis93cbca32008-09-10 14:08:48 +000034UNIDATA_VERSION = "5.1.0"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000035UNICODE_DATA = "UnicodeData%s.txt"
36COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
37EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
Martin v. Löwis13c3e382007-08-14 22:37:03 +000038DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +000039DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000040
41old_versions = ["3.2.0"]
Fredrik Lundhf367cac2000-09-24 23:18:31 +000042
43CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
44 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
45 "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
46 "So" ]
47
48BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
49 "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
50 "ON" ]
51
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +000052EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
53
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000054# note: should match definitions in Objects/unicodectype.c
Fredrik Lundhe9133f72000-09-25 17:59:57 +000055ALPHA_MASK = 0x01
56DECIMAL_MASK = 0x02
57DIGIT_MASK = 0x04
58LOWER_MASK = 0x08
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000059LINEBREAK_MASK = 0x10
Fredrik Lundhe9133f72000-09-25 17:59:57 +000060SPACE_MASK = 0x20
61TITLE_MASK = 0x40
62UPPER_MASK = 0x80
Martin v. Löwis13c3e382007-08-14 22:37:03 +000063XID_START_MASK = 0x100
64XID_CONTINUE_MASK = 0x200
Georg Brandld52429f2008-07-04 15:55:02 +000065PRINTABLE_MASK = 0x400
Martin v. Löwis93cbca32008-09-10 14:08:48 +000066NODELTA_MASK = 0x800
Fredrik Lundhe9133f72000-09-25 17:59:57 +000067
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000068def maketables(trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +000069
Collin Winter6afaeb72007-08-03 17:06:41 +000070 print("--- Reading", UNICODE_DATA % "", "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000071
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000072 version = ""
73 unicode = UnicodeData(UNICODE_DATA % version,
74 COMPOSITION_EXCLUSIONS % version,
Martin v. Löwis13c3e382007-08-14 22:37:03 +000075 EASTASIAN_WIDTH % version,
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +000076 DERIVED_CORE_PROPERTIES % version,
77 DERIVEDNORMALIZATION_PROPS % version)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000078
Georg Brandl559e5d72008-06-11 18:37:52 +000079 print(len(list(filter(None, unicode.table))), "characters")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000080
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000081 for version in old_versions:
Collin Winter6afaeb72007-08-03 17:06:41 +000082 print("--- Reading", UNICODE_DATA % ("-"+version), "...")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000083 old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
84 COMPOSITION_EXCLUSIONS % ("-"+version),
Martin v. Löwis13c3e382007-08-14 22:37:03 +000085 EASTASIAN_WIDTH % ("-"+version),
86 DERIVED_CORE_PROPERTIES % ("-"+version))
Georg Brandl559e5d72008-06-11 18:37:52 +000087 print(len(list(filter(None, old_unicode.table))), "characters")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088 merge_old_version(version, unicode, old_unicode)
89
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000090 makeunicodename(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000091 makeunicodedata(unicode, trace)
Fredrik Lundhb2dfd732001-01-21 23:31:52 +000092 makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000093
94# --------------------------------------------------------------------
95# unicode character properties
96
97def makeunicodedata(unicode, trace):
98
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +000099 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000100 table = [dummy]
101 cache = {0: dummy}
102 index = [0] * len(unicode.chars)
103
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000104 FILE = "Modules/unicodedata_db.h"
105
Collin Winter6afaeb72007-08-03 17:06:41 +0000106 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000107
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000108 # 1) database properties
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000109
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000110 for char in unicode.chars:
111 record = unicode.table[char]
112 if record:
113 # extract database properties
114 category = CATEGORY_NAMES.index(record[2])
115 combining = int(record[3])
116 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
117 mirrored = record[9] == "Y"
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000118 eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000119 normalizationquickcheck = record[17]
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000120 item = (
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000121 category, combining, bidirectional, mirrored, eastasianwidth,
122 normalizationquickcheck
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000123 )
124 # add entry to index and item tables
125 i = cache.get(item)
126 if i is None:
127 cache[item] = i = len(table)
128 table.append(item)
129 index[char] = i
130
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000131 # 2) decomposition data
132
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000133 decomp_data = [0]
134 decomp_prefix = [""]
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000135 decomp_index = [0] * len(unicode.chars)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000136 decomp_size = 0
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000137
Martin v. Löwis677bde22002-11-23 22:08:15 +0000138 comp_pairs = []
139 comp_first = [None] * len(unicode.chars)
140 comp_last = [None] * len(unicode.chars)
141
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000142 for char in unicode.chars:
143 record = unicode.table[char]
144 if record:
145 if record[5]:
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000146 decomp = record[5].split()
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000147 if len(decomp) > 19:
Collin Wintera817e582007-08-22 23:05:06 +0000148 raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000149 # prefix
150 if decomp[0][0] == "<":
151 prefix = decomp.pop(0)
152 else:
153 prefix = ""
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000154 try:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000155 i = decomp_prefix.index(prefix)
156 except ValueError:
157 i = len(decomp_prefix)
158 decomp_prefix.append(prefix)
159 prefix = i
160 assert prefix < 256
161 # content
Georg Brandlbf82e372008-05-16 17:02:34 +0000162 decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
Martin v. Löwis677bde22002-11-23 22:08:15 +0000163 # Collect NFC pairs
164 if not prefix and len(decomp) == 3 and \
165 char not in unicode.exclusions and \
166 unicode.table[decomp[1]][3] == "0":
167 p, l, r = decomp
168 comp_first[l] = 1
169 comp_last[r] = 1
170 comp_pairs.append((l,r,char))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000171 try:
172 i = decomp_data.index(decomp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000173 except ValueError:
174 i = len(decomp_data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000175 decomp_data.extend(decomp)
176 decomp_size = decomp_size + len(decomp) * 2
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000177 else:
178 i = 0
179 decomp_index[char] = i
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000180
Martin v. Löwis677bde22002-11-23 22:08:15 +0000181 f = l = 0
182 comp_first_ranges = []
183 comp_last_ranges = []
184 prev_f = prev_l = None
185 for i in unicode.chars:
186 if comp_first[i] is not None:
187 comp_first[i] = f
188 f += 1
189 if prev_f is None:
190 prev_f = (i,i)
191 elif prev_f[1]+1 == i:
192 prev_f = prev_f[0],i
193 else:
194 comp_first_ranges.append(prev_f)
195 prev_f = (i,i)
196 if comp_last[i] is not None:
197 comp_last[i] = l
198 l += 1
199 if prev_l is None:
200 prev_l = (i,i)
201 elif prev_l[1]+1 == i:
202 prev_l = prev_l[0],i
203 else:
204 comp_last_ranges.append(prev_l)
205 prev_l = (i,i)
206 comp_first_ranges.append(prev_f)
207 comp_last_ranges.append(prev_l)
208 total_first = f
209 total_last = l
210
211 comp_data = [0]*(total_first*total_last)
212 for f,l,char in comp_pairs:
213 f = comp_first[f]
214 l = comp_last[l]
215 comp_data[f*total_last+l] = char
216
Collin Winter6afaeb72007-08-03 17:06:41 +0000217 print(len(table), "unique properties")
218 print(len(decomp_prefix), "unique decomposition prefixes")
219 print(len(decomp_data), "unique decomposition entries:", end=' ')
220 print(decomp_size, "bytes")
221 print(total_first, "first characters in NFC")
222 print(total_last, "last characters in NFC")
223 print(len(comp_pairs), "NFC pairs")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000224
Collin Winter6afaeb72007-08-03 17:06:41 +0000225 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000226
Fred Drake9c685052000-10-26 03:56:46 +0000227 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000228 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
229 print(file=fp)
230 print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
231 print("/* a list of unique database records */", file=fp)
232 print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000233 for item in table:
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000234 print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
Collin Winter6afaeb72007-08-03 17:06:41 +0000235 print("};", file=fp)
236 print(file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000237
Collin Winter6afaeb72007-08-03 17:06:41 +0000238 print("/* Reindexing of NFC first characters. */", file=fp)
239 print("#define TOTAL_FIRST",total_first, file=fp)
240 print("#define TOTAL_LAST",total_last, file=fp)
241 print("struct reindex{int start;short count,index;};", file=fp)
Martin v. Löwis59683e82008-06-13 07:50:45 +0000242 print("static struct reindex nfc_first[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000243 for start,end in comp_first_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000244 print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
245 print(" {0,0,0}", file=fp)
246 print("};\n", file=fp)
Martin v. Löwis59683e82008-06-13 07:50:45 +0000247 print("static struct reindex nfc_last[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000248 for start,end in comp_last_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000249 print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
250 print(" {0,0,0}", file=fp)
251 print("};\n", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000252
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000253 # FIXME: <fl> the following tables could be made static, and
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000254 # the support code moved into unicodedatabase.c
255
Collin Winter6afaeb72007-08-03 17:06:41 +0000256 print("/* string literals */", file=fp)
257 print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000258 for name in CATEGORY_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000259 print(" \"%s\"," % name, file=fp)
260 print(" NULL", file=fp)
261 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000262
Collin Winter6afaeb72007-08-03 17:06:41 +0000263 print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000264 for name in BIDIRECTIONAL_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000265 print(" \"%s\"," % name, file=fp)
266 print(" NULL", file=fp)
267 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000268
Collin Winter6afaeb72007-08-03 17:06:41 +0000269 print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000270 for name in EASTASIANWIDTH_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000271 print(" \"%s\"," % name, file=fp)
272 print(" NULL", file=fp)
273 print("};", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000274
Collin Winter6afaeb72007-08-03 17:06:41 +0000275 print("static const char *decomp_prefix[] = {", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000276 for name in decomp_prefix:
Collin Winter6afaeb72007-08-03 17:06:41 +0000277 print(" \"%s\"," % name, file=fp)
278 print(" NULL", file=fp)
279 print("};", file=fp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000280
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000281 # split record index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000282 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000283
Collin Winter6afaeb72007-08-03 17:06:41 +0000284 print("/* index tables for the database records */", file=fp)
285 print("#define SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000286 Array("index1", index1).dump(fp, trace)
287 Array("index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000288
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000289 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000290 index1, index2, shift = splitbins(decomp_index, trace)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000291
Collin Winter6afaeb72007-08-03 17:06:41 +0000292 print("/* decomposition data */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000293 Array("decomp_data", decomp_data).dump(fp, trace)
294
Collin Winter6afaeb72007-08-03 17:06:41 +0000295 print("/* index tables for the decomposition data */", file=fp)
296 print("#define DECOMP_SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000297 Array("decomp_index1", index1).dump(fp, trace)
298 Array("decomp_index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000299
Martin v. Löwis677bde22002-11-23 22:08:15 +0000300 index, index2, shift = splitbins(comp_data, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000301 print("/* NFC pairs */", file=fp)
302 print("#define COMP_SHIFT", shift, file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000303 Array("comp_index", index).dump(fp, trace)
304 Array("comp_data", index2).dump(fp, trace)
305
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000306 # Generate delta tables for old versions
307 for version, table, normalization in unicode.changed:
308 cversion = version.replace(".","_")
309 records = [table[0]]
310 cache = {table[0]:0}
311 index = [0] * len(table)
312 for i, record in enumerate(table):
313 try:
314 index[i] = cache[record]
315 except KeyError:
316 index[i] = cache[record] = len(records)
317 records.append(record)
318 index1, index2, shift = splitbins(index, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000319 print("static const change_record change_records_%s[] = {" % cversion, file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000320 for record in records:
Collin Winter6afaeb72007-08-03 17:06:41 +0000321 print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
322 print("};", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000323 Array("changes_%s_index" % cversion, index1).dump(fp, trace)
324 Array("changes_%s_data" % cversion, index2).dump(fp, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000325 print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
326 print("{", file=fp)
327 print("\tint index;", file=fp)
328 print("\tif (n >= 0x110000) index = 0;", file=fp)
329 print("\telse {", file=fp)
330 print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
331 print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
332 (cversion, shift, ((1<<shift)-1)), file=fp)
333 print("\t}", file=fp)
334 print("\treturn change_records_%s+index;" % cversion, file=fp)
335 print("}\n", file=fp)
336 print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
337 print("{", file=fp)
338 print("\tswitch(n) {", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000339 for k, v in normalization:
Collin Winter6afaeb72007-08-03 17:06:41 +0000340 print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
341 print("\tdefault: return 0;", file=fp)
342 print("\t}\n}\n", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000343
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000344 fp.close()
345
346# --------------------------------------------------------------------
347# unicode character type tables
348
349def makeunicodetype(unicode, trace):
350
351 FILE = "Objects/unicodetype_db.h"
352
Collin Winter6afaeb72007-08-03 17:06:41 +0000353 print("--- Preparing", FILE, "...")
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000354
355 # extract unicode types
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000356 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000357 table = [dummy]
358 cache = {0: dummy}
359 index = [0] * len(unicode.chars)
360
361 for char in unicode.chars:
362 record = unicode.table[char]
363 if record:
364 # extract database properties
365 category = record[2]
366 bidirectional = record[4]
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000367 properties = record[16]
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000368 flags = 0
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000369 delta = True
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000370 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
371 flags |= ALPHA_MASK
372 if category == "Ll":
373 flags |= LOWER_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000374 if category == "Zl" or bidirectional == "B":
375 flags |= LINEBREAK_MASK
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000376 if category == "Zs" or bidirectional in ("WS", "B", "S"):
377 flags |= SPACE_MASK
Fredrik Lundh375732c2000-09-25 23:03:34 +0000378 if category == "Lt":
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000379 flags |= TITLE_MASK
380 if category == "Lu":
381 flags |= UPPER_MASK
Benjamin Peterson09832742009-03-26 17:15:46 +0000382 if char == ord(" ") or category[0] not in ("C", "Z"):
Georg Brandld52429f2008-07-04 15:55:02 +0000383 flags |= PRINTABLE_MASK
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000384 if "XID_Start" in properties:
385 flags |= XID_START_MASK
386 if "XID_Continue" in properties:
387 flags |= XID_CONTINUE_MASK
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000388 # use delta predictor for upper/lower/title if it fits
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000389 if record[12]:
Walter Dörwald1b08b302009-04-25 14:13:56 +0000390 upper = int(record[12], 16)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000391 else:
Walter Dörwald1b08b302009-04-25 14:13:56 +0000392 upper = char
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000393 if record[13]:
Walter Dörwald1b08b302009-04-25 14:13:56 +0000394 lower = int(record[13], 16)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000395 else:
Walter Dörwald1b08b302009-04-25 14:13:56 +0000396 lower = char
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000397 if record[14]:
Walter Dörwald1b08b302009-04-25 14:13:56 +0000398 title = int(record[14], 16)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000399 else:
Walter Dörwald1b08b302009-04-25 14:13:56 +0000400 # UCD.html says that a missing title char means that
401 # it defaults to the uppercase character, not to the
402 # character itself. Apparently, in the current UCD (5.x)
403 # this feature is never used
404 title = upper
405 upper_d = upper - char
406 lower_d = lower - char
407 title_d = title - char
408 if -32768 <= upper_d <= 32767 and \
409 -32768 <= lower_d <= 32767 and \
410 -32768 <= title_d <= 32767:
411 # use deltas
412 upper = upper_d & 0xffff
413 lower = lower_d & 0xffff
414 title = title_d & 0xffff
415 else:
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000416 flags |= NODELTA_MASK
Fredrik Lundh0f8fad42000-09-25 21:01:56 +0000417 # decimal digit, integer digit
418 decimal = 0
419 if record[6]:
420 flags |= DECIMAL_MASK
421 decimal = int(record[6])
422 digit = 0
423 if record[7]:
424 flags |= DIGIT_MASK
425 digit = int(record[7])
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000426 item = (
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000427 upper, lower, title, decimal, digit, flags
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000428 )
429 # add entry to index and item tables
430 i = cache.get(item)
431 if i is None:
432 cache[item] = i = len(table)
433 table.append(item)
434 index[char] = i
435
Collin Winter6afaeb72007-08-03 17:06:41 +0000436 print(len(table), "unique character type entries")
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000437
Collin Winter6afaeb72007-08-03 17:06:41 +0000438 print("--- Writing", FILE, "...")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000439
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000440 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000441 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
442 print(file=fp)
443 print("/* a list of unique character type descriptors */", file=fp)
444 print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000445 for item in table:
Collin Winter6afaeb72007-08-03 17:06:41 +0000446 print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
447 print("};", file=fp)
448 print(file=fp)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000449
450 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000451 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000452
Collin Winter6afaeb72007-08-03 17:06:41 +0000453 print("/* type indexes */", file=fp)
454 print("#define SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000455 Array("index1", index1).dump(fp, trace)
456 Array("index2", index2).dump(fp, trace)
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000457
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000458 fp.close()
459
460# --------------------------------------------------------------------
461# unicode name database
462
463def makeunicodename(unicode, trace):
464
465 FILE = "Modules/unicodename_db.h"
466
Collin Winter6afaeb72007-08-03 17:06:41 +0000467 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000468
469 # collect names
470 names = [None] * len(unicode.chars)
471
472 for char in unicode.chars:
473 record = unicode.table[char]
474 if record:
475 name = record[1].strip()
476 if name and name[0] != "<":
477 names[char] = name + chr(0)
478
Georg Brandl559e5d72008-06-11 18:37:52 +0000479 print(len(list(n for n in names if n is not None)), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000480
481 # collect unique words from names (note that we differ between
482 # words inside a sentence, and words ending a sentence. the
483 # latter includes the trailing null byte.
484
485 words = {}
486 n = b = 0
487 for char in unicode.chars:
488 name = names[char]
489 if name:
490 w = name.split()
491 b = b + len(name)
492 n = n + len(w)
493 for w in w:
494 l = words.get(w)
495 if l:
496 l.append(None)
497 else:
498 words[w] = [len(words)]
499
Collin Winter6afaeb72007-08-03 17:06:41 +0000500 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000501
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000502 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000503
Martin v. Löwis97225da2002-11-24 23:05:09 +0000504 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000505 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000506 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000507 return -len(alist), aword
508 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000509
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000510 # figure out how many phrasebook escapes we need
511 escapes = 0
512 while escapes * 256 < len(wordlist):
513 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000514 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000515
516 short = 256 - escapes
517
518 assert short > 0
519
Collin Winter6afaeb72007-08-03 17:06:41 +0000520 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000521
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000522 # statistics
523 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000524 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000525 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000526 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000527
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000528 # pick the most commonly used words, and sort the rest on falling
529 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000530
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000531 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000532 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000533 wordlist.extend(wordtail)
534
535 # generate lexicon from words
536
537 lexicon_offset = [0]
538 lexicon = ""
539 words = {}
540
541 # build a lexicon string
542 offset = 0
543 for w, x in wordlist:
544 # encoding: bit 7 indicates last character in word (chr(128)
545 # indicates the last character in an entire string)
546 ww = w[:-1] + chr(ord(w[-1])+128)
547 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000548 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000549 if o < 0:
550 o = offset
551 lexicon = lexicon + ww
552 offset = offset + len(w)
553 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000554 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000555
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000556 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000557
558 # generate phrasebook from names and lexicon
559 phrasebook = [0]
560 phrasebook_offset = [0] * len(unicode.chars)
561 for char in unicode.chars:
562 name = names[char]
563 if name:
564 w = name.split()
565 phrasebook_offset[char] = len(phrasebook)
566 for w in w:
567 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000568 if i < short:
569 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000570 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000571 # store as two bytes
572 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000573 phrasebook.append(i&255)
574
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000575 assert getsize(phrasebook) == 1
576
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000577 #
578 # unicode name hash table
579
580 # extract names
581 data = []
582 for char in unicode.chars:
583 record = unicode.table[char]
584 if record:
585 name = record[1].strip()
586 if name and name[0] != "<":
587 data.append((name, char))
588
589 # the magic number 47 was chosen to minimize the number of
590 # collisions on the current data set. if you like, change it
591 # and see what happens...
592
593 codehash = Hash("code", data, 47)
594
Collin Winter6afaeb72007-08-03 17:06:41 +0000595 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000596
597 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000598 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
599 print(file=fp)
600 print("#define NAME_MAXLEN", 256, file=fp)
601 print(file=fp)
602 print("/* lexicon */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000603 Array("lexicon", lexicon).dump(fp, trace)
604 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000605
606 # split decomposition index table
607 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
608
Collin Winter6afaeb72007-08-03 17:06:41 +0000609 print("/* code->name phrasebook */", file=fp)
610 print("#define phrasebook_shift", shift, file=fp)
611 print("#define phrasebook_short", short, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000612
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000613 Array("phrasebook", phrasebook).dump(fp, trace)
614 Array("phrasebook_offset1", offset1).dump(fp, trace)
615 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000616
Collin Winter6afaeb72007-08-03 17:06:41 +0000617 print("/* name->code dictionary */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000618 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000619
620 fp.close()
621
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000622
623def merge_old_version(version, new, old):
624 # Changes to exclusion file not implemented yet
625 if old.exclusions != new.exclusions:
Collin Wintera817e582007-08-22 23:05:06 +0000626 raise NotImplementedError("exclusions differ")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000627
628 # In these change records, 0xFF means "no change"
629 bidir_changes = [0xFF]*0x110000
630 category_changes = [0xFF]*0x110000
631 decimal_changes = [0xFF]*0x110000
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000632 mirrored_changes = [0xFF]*0x110000
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000633 # In numeric data, 0 means "no change",
634 # -1 means "did not have a numeric value
635 numeric_changes = [0] * 0x110000
636 # normalization_changes is a list of key-value pairs
637 normalization_changes = []
638 for i in range(0x110000):
639 if new.table[i] is None:
640 # Characters unassigned in the new version ought to
641 # be unassigned in the old one
642 assert old.table[i] is None
643 continue
644 # check characters unassigned in the old version
645 if old.table[i] is None:
646 # category 0 is "unassigned"
647 category_changes[i] = 0
648 continue
649 # check characters that differ
650 if old.table[i] != new.table[i]:
651 for k in range(len(old.table[i])):
652 if old.table[i][k] != new.table[i][k]:
653 value = old.table[i][k]
654 if k == 2:
655 #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
656 category_changes[i] = CATEGORY_NAMES.index(value)
657 elif k == 4:
658 #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
659 bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
660 elif k == 5:
661 #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
662 # We assume that all normalization changes are in 1:1 mappings
663 assert " " not in value
664 normalization_changes.append((i, value))
665 elif k == 6:
666 #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
667 # we only support changes where the old value is a single digit
668 assert value in "0123456789"
669 decimal_changes[i] = int(value)
670 elif k == 8:
671 # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
672 # Since 0 encodes "no change", the old value is better not 0
673 assert value != "0" and value != "-1"
674 if not value:
675 numeric_changes[i] = -1
676 else:
677 assert re.match("^[0-9]+$", value)
678 numeric_changes[i] = int(value)
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000679 elif k == 9:
680 if value == 'Y':
681 mirrored_changes[i] = '1'
682 else:
683 mirrored_changes[i] = '0'
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000684 elif k == 11:
685 # change to ISO comment, ignore
686 pass
687 elif k == 12:
688 # change to simple uppercase mapping; ignore
689 pass
690 elif k == 13:
691 # change to simple lowercase mapping; ignore
692 pass
693 elif k == 14:
694 # change to simple titlecase mapping; ignore
695 pass
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000696 elif k == 16:
697 # derived property changes; not yet
698 pass
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000699 else:
700 class Difference(Exception):pass
Collin Wintera817e582007-08-22 23:05:06 +0000701 raise Difference(hex(i), k, old.table[i], new.table[i])
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000702 new.changed.append((version, list(zip(bidir_changes, category_changes,
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000703 decimal_changes, mirrored_changes,
704 numeric_changes)),
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000705 normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000706
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000707
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000708# --------------------------------------------------------------------
709# the following support code is taken from the unidb utilities
710# Copyright (c) 1999-2000 by Secret Labs AB
711
712# load a unicode-data file from disk
713
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000714import sys
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000715
716class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000717 # Record structure:
718 # [ID, name, category, combining, bidi, decomp, (6)
719 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
720 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
721 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000722
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000723 def __init__(self, filename, exclusions, eastasianwidth,
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000724 derivedprops, derivednormalizationprops=None, expand=1):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000725 self.changed = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000726 file = open(filename)
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000727 table = [None] * 0x110000
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000728 while 1:
729 s = file.readline()
730 if not s:
731 break
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000732 s = s.strip().split(";")
733 char = int(s[0], 16)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000734 table[char] = s
735
Martin v. Löwis97225da2002-11-24 23:05:09 +0000736 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000737 if expand:
738 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000739 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000740 s = table[i]
741 if s:
742 if s[1][-6:] == "First>":
743 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000744 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000745 elif s[1][-5:] == "Last>":
746 s[1] = ""
747 field = None
748 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000749 f2 = field[:]
750 f2[0] = "%X" % i
751 table[i] = f2
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000752
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000753 # public attributes
754 self.filename = filename
755 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000756 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000757
Martin v. Löwis677bde22002-11-23 22:08:15 +0000758 file = open(exclusions)
759 self.exclusions = {}
760 for s in file:
761 s = s.strip()
762 if not s:
763 continue
764 if s[0] == '#':
765 continue
766 char = int(s.split()[0],16)
767 self.exclusions[char] = 1
768
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000769 widths = [None] * 0x110000
770 for s in open(eastasianwidth):
771 s = s.strip()
772 if not s:
773 continue
774 if s[0] == '#':
775 continue
776 s = s.split()[0].split(';')
777 if '..' in s[0]:
778 first, last = [int(c, 16) for c in s[0].split('..')]
Georg Brandlbf82e372008-05-16 17:02:34 +0000779 chars = list(range(first, last+1))
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000780 else:
781 chars = [int(s[0], 16)]
782 for char in chars:
783 widths[char] = s[1]
784 for i in range(0, 0x110000):
785 if table[i] is not None:
786 table[i].append(widths[i])
787
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000788 for i in range(0, 0x110000):
789 if table[i] is not None:
790 table[i].append(set())
791 for s in open(derivedprops):
792 s = s.split('#', 1)[0].strip()
793 if not s:
794 continue
795
796 r, p = s.split(";")
797 r = r.strip()
798 p = p.strip()
799 if ".." in r:
800 first, last = [int(c, 16) for c in r.split('..')]
Georg Brandlbf82e372008-05-16 17:02:34 +0000801 chars = list(range(first, last+1))
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000802 else:
803 chars = [int(r, 16)]
804 for char in chars:
805 if table[char]:
806 # Some properties (e.g. Default_Ignorable_Code_Point)
807 # apply to unassigned code points; ignore them
808 table[char][-1].add(p)
809
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000810 if derivednormalizationprops:
811 quickchecks = [0] * 0x110000 # default is Yes
812 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
813 for s in open(derivednormalizationprops):
814 if '#' in s:
815 s = s[:s.index('#')]
816 s = [i.strip() for i in s.split(';')]
817 if len(s) < 2 or s[1] not in qc_order:
818 continue
819 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
820 quickcheck_shift = qc_order.index(s[1])*2
821 quickcheck <<= quickcheck_shift
822 if '..' not in s[0]:
823 first = last = int(s[0], 16)
824 else:
825 first, last = [int(c, 16) for c in s[0].split('..')]
826 for char in range(first, last+1):
827 assert not (quickchecks[char]>>quickcheck_shift)&3
828 quickchecks[char] |= quickcheck
829 for i in range(0, 0x110000):
830 if table[i] is not None:
831 table[i].append(quickchecks[i])
832
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000833 def uselatin1(self):
834 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +0000835 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000836
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000837# hash table tools
838
839# this is a straight-forward reimplementation of Python's built-in
840# dictionary type, using a static data structure, and a custom string
841# hash algorithm.
842
843def myhash(s, magic):
844 h = 0
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000845 for c in map(ord, s.upper()):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000846 h = (h * magic) + c
Guido van Rossumcd16bf62007-06-13 18:07:49 +0000847 ix = h & 0xff000000
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000848 if ix:
849 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
850 return h
851
852SIZES = [
853 (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
854 (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
855 (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
856 (2097152,5), (4194304,3), (8388608,33), (16777216,27)
857]
858
859class Hash:
860 def __init__(self, name, data, magic):
861 # turn a (key, value) list into a static hash table structure
862
863 # determine table size
864 for size, poly in SIZES:
865 if size > len(data):
866 poly = size + poly
867 break
868 else:
Collin Wintera817e582007-08-22 23:05:06 +0000869 raise AssertionError("ran out of polynominals")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000870
Collin Winter6afaeb72007-08-03 17:06:41 +0000871 print(size, "slots in hash table")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000872
873 table = [None] * size
874
875 mask = size-1
876
877 n = 0
878
879 hash = myhash
880
881 # initialize hash table
882 for key, value in data:
883 h = hash(key, magic)
884 i = (~h) & mask
885 v = table[i]
886 if v is None:
887 table[i] = value
888 continue
889 incr = (h ^ (h >> 3)) & mask;
890 if not incr:
891 incr = mask
892 while 1:
893 n = n + 1
894 i = (i + incr) & mask
895 v = table[i]
896 if v is None:
897 table[i] = value
898 break
899 incr = incr << 1
900 if incr > mask:
901 incr = incr ^ poly
902
Collin Winter6afaeb72007-08-03 17:06:41 +0000903 print(n, "collisions")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000904 self.collisions = n
905
906 for i in range(len(table)):
907 if table[i] is None:
908 table[i] = 0
909
910 self.data = Array(name + "_hash", table)
911 self.magic = magic
912 self.name = name
913 self.size = size
914 self.poly = poly
915
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000916 def dump(self, file, trace):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000917 # write data to file, as a C array
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000918 self.data.dump(file, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000919 file.write("#define %s_magic %d\n" % (self.name, self.magic))
920 file.write("#define %s_size %d\n" % (self.name, self.size))
921 file.write("#define %s_poly %d\n" % (self.name, self.poly))
922
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000923# stuff to deal with arrays of unsigned integers
924
925class Array:
926
927 def __init__(self, name, data):
928 self.name = name
929 self.data = data
930
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000931 def dump(self, file, trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000932 # write data to file, as a C array
933 size = getsize(self.data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000934 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +0000935 print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000936 file.write("static ")
937 if size == 1:
938 file.write("unsigned char")
939 elif size == 2:
940 file.write("unsigned short")
941 else:
942 file.write("unsigned int")
943 file.write(" " + self.name + "[] = {\n")
944 if self.data:
945 s = " "
946 for item in self.data:
947 i = str(item) + ", "
948 if len(s) + len(i) > 78:
949 file.write(s + "\n")
950 s = " " + i
951 else:
952 s = s + i
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000953 if s.strip():
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000954 file.write(s + "\n")
955 file.write("};\n\n")
956
957def getsize(data):
958 # return smallest possible integer size for the given array
959 maxdata = max(data)
960 if maxdata < 256:
961 return 1
962 elif maxdata < 65536:
963 return 2
964 else:
965 return 4
966
Tim Peters21013482000-09-25 07:13:41 +0000967def splitbins(t, trace=0):
968 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
969
970 t is a sequence of ints. This function can be useful to save space if
971 many of the ints are the same. t1 and t2 are lists of ints, and shift
972 is an int, chosen to minimize the combined size of t1 and t2 (in C
973 code), and where for each i in range(len(t)),
974 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
975 where mask is a bitmask isolating the last "shift" bits.
976
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000977 If optional arg trace is non-zero (default zero), progress info
978 is printed to sys.stderr. The higher the value, the more info
979 you'll get.
Tim Peters21013482000-09-25 07:13:41 +0000980 """
981
982 import sys
983 if trace:
984 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +0000985 print("%d+%d bins at shift %d; %d bytes" % (
986 len(t1), len(t2), shift, bytes), file=sys.stderr)
987 print("Size of original table:", len(t)*getsize(t), \
988 "bytes", file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +0000989 n = len(t)-1 # last valid index
990 maxshift = 0 # the most we can shift n and still have something left
991 if n > 0:
992 while n >> 1:
993 n >>= 1
994 maxshift += 1
995 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +0000996 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +0000997 t = tuple(t) # so slices can be dict keys
998 for shift in range(maxshift + 1):
999 t1 = []
1000 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001001 size = 2**shift
1002 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001003 for i in range(0, len(t), size):
1004 bin = t[i:i+size]
1005 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001006 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001007 index = len(t2)
1008 bincache[bin] = index
1009 t2.extend(bin)
1010 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001011 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001012 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001013 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001014 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001015 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001016 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001017 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001018 t1, t2, shift = best
1019 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001020 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001021 dump(t1, t2, shift, bytes)
1022 if __debug__:
1023 # exhaustively verify that the decomposition is correct
1024 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001025 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001026 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1027 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001028
1029if __name__ == "__main__":
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001030 maketables(1)