blob: 4eda1b98cdec07d829237757e01319fe55edbac5 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts the unicode database files (the current release,
# see UNIDATA_VERSION below, plus the releases listed in old_versions) to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
27
28import sys
29
# Identification written into the banner comment of every generated header.
SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.2.0"

# Input file name templates.  maketables() fills the "%s" slot with "" for
# the current release and with "-<version>" for each entry of old_versions.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

# Older database releases that are read in addition to the current one.
old_versions = ["3.2.0"]
Fredrik Lundhf367cac2000-09-24 23:18:31 +000043
# Name tables.  A name's position in its list is the small integer stored in
# the generated database records, and the lists are also emitted verbatim as
# C string arrays, so the order (including the repeated "Cn") must not change.
CATEGORY_NAMES = (
    "Cn Lu Ll Lt Mn Mc Me Nd "
    "Nl No Zs Zl Zp Cc Cf Cs Co Cn Lm "
    "Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk "
    "So"
).split()

# The leading empty string is the entry used for "no bidirectional class".
BIDIRECTIONAL_NAMES = [""] + (
    "L LRE LRO R AL RLE RLO "
    "PDF EN ES ET AN CS NSM BN B S WS "
    "ON"
).split()

EASTASIANWIDTH_NAMES = "F H W Na A N".split()
54
# Flag bits stored in the "flags" field of each character type record.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01           # category Lm, Lt, Lu, Ll or Lo
DECIMAL_MASK = 0x02         # has a decimal digit value
DIGIT_MASK = 0x04           # has a digit value
LOWER_MASK = 0x08           # category Ll
LINEBREAK_MASK = 0x10       # category Zl or bidirectional class B
SPACE_MASK = 0x20           # category Zs or bidirectional class WS, B or S
TITLE_MASK = 0x40           # category Lt
UPPER_MASK = 0x80           # category Lu
XID_START_MASK = 0x100      # has the XID_Start property
XID_CONTINUE_MASK = 0x200   # has the XID_Continue property
PRINTABLE_MASK = 0x400      # the space character, or category not C*/Z*
NODELTA_MASK = 0x800        # case mappings do not fit the 16-bit delta form
NUMERIC_MASK = 0x1000       # has a numeric value
Fredrik Lundhe9133f72000-09-25 17:59:57 +000069
def maketables(trace=0):
    """Read the Unicode database files and regenerate all three headers.

    The current release is loaded first, then every release listed in
    old_versions is loaded and merged into it with merge_old_version().
    Finally the name, property and type tables are written.  `trace` is
    passed through unchanged to the individual generators.
    """
    current_files = [
        template % ""
        for template in (
            UNICODE_DATA,
            COMPOSITION_EXCLUSIONS,
            EASTASIAN_WIDTH,
            UNIHAN,
            DERIVED_CORE_PROPERTIES,
            DERIVEDNORMALIZATION_PROPS,
        )
    ]
    print("--- Reading", current_files[0], "...")
    unicode = UnicodeData(*current_files)
    print(sum(1 for record in unicode.table if record), "characters")

    for version in old_versions:
        suffix = "-" + version
        # Old releases are loaded without the normalization properties file.
        old_files = [
            template % suffix
            for template in (
                UNICODE_DATA,
                COMPOSITION_EXCLUSIONS,
                EASTASIAN_WIDTH,
                UNIHAN,
                DERIVED_CORE_PROPERTIES,
            )
        ]
        print("--- Reading", old_files[0], "...")
        old_unicode = UnicodeData(*old_files)
        print(sum(1 for record in old_unicode.table if record), "characters")
        merge_old_version(version, unicode, old_unicode)

    for generate in (makeunicodename, makeunicodedata, makeunicodetype):
        generate(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000097
# --------------------------------------------------------------------
# unicode character properties
100
def _number_marked(marks, chars):
    """Number the marked code points and group them into runs.

    `marks` is a list indexed by code point.  Every entry that is not None
    is replaced, in place, by the next consecutive number (0, 1, 2, ...)
    in the order in which `chars` yields the code points.

    Returns (ranges, count): `ranges` is a list of inclusive (first, last)
    tuples, a run being extended while the next marked code point is exactly
    one above the run's end; `count` is the number of entries numbered.
    """
    ranges = []
    run = None
    count = 0
    for code in chars:
        if marks[code] is None:
            continue
        marks[code] = count
        count += 1
        if run is None:
            run = (code, code)
        elif run[1] + 1 == code:
            run = (run[0], code)
        else:
            ranges.append(run)
            run = (code, code)
    # Flush the last open run; with nothing marked there is no run to add.
    if run is not None:
        ranges.append(run)
    return ranges, count


def _write_name_table(fp, declaration, names):
    """Write `names` to fp as a NULL-terminated C array of string literals."""
    print(declaration, file=fp)
    for name in names:
        print(" \"%s\"," % name, file=fp)
    print(" NULL", file=fp)
    print("};", file=fp)


def _write_reindex_table(fp, name, ranges, numbering):
    """Write one `struct reindex` array: start, run length - 1, first number."""
    print("static struct reindex %s[] = {" % name, file=fp)
    for start, end in ranges:
        print(" { %d, %d, %d}," % (start, end-start, numbering[start]), file=fp)
    print(" {0,0,0}", file=fp)
    print("};\n", file=fp)


def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h.

    unicode -- the parsed database; this function reads its `chars`,
               `table`, `exclusions` and `changed` attributes.
    trace   -- passed through to splitbins() and Array.dump().

    The header contains the unique property records with a two-level index,
    the decomposition data with its index, the NFC composition tables, and
    one set of delta tables per entry of `unicode.changed`.

    Raises Exception if a character's decomposition field has more than 19
    whitespace-separated parts.
    """
    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    # Record 0 is the all-zero placeholder used for code points without a
    # record.  The cache maps record -> position in `table`, so it has to be
    # seeded with the placeholder as the key (it used to be seeded the other
    # way round, which made the placeholder impossible to find).
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        # category, combining, bidirectional, mirrored, east asian width,
        # normalization quick check
        item = (
            CATEGORY_NAMES.index(record[2]),
            int(record[3]),
            BIDIRECTIONAL_NAMES.index(record[4]),
            record[9] == "Y",
            EASTASIANWIDTH_NAMES.index(record[15]),
            record[17],
        )
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        if not record[5]:
            decomp_index[char] = 0
            continue
        decomp = record[5].split()
        if len(decomp) > 19:
            raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
        # prefix: an optional leading "<tag>", stored as an index into
        # decomp_prefix (index 0 is the empty prefix)
        if decomp[0][0] == "<":
            prefix = decomp.pop(0)
        else:
            prefix = ""
        try:
            prefix = decomp_prefix.index(prefix)
        except ValueError:
            decomp_prefix.append(prefix)
            prefix = len(decomp_prefix) - 1
        assert prefix < 256
        # content: a header word (prefix index in the low byte, length in
        # the bits above it) followed by the code points
        decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
        # Collect NFC pairs
        if not prefix and len(decomp) == 3 and \
           char not in unicode.exclusions and \
           unicode.table[decomp[1]][3] == "0":
            _, first, last = decomp
            comp_first[first] = 1
            comp_last[last] = 1
            comp_pairs.append((first, last, char))
        # NOTE: this used to try decomp_data.index(decomp) first, but
        # decomp_data is a flat list of ints and never contains a list, so
        # that search always failed.  Every entry is simply appended, which
        # yields the same data without the quadratic scan.
        decomp_index[char] = len(decomp_data)
        decomp_data.extend(decomp)
        decomp_size = decomp_size + len(decomp) * 2

    # Replace the markers in comp_first/comp_last by consecutive numbers and
    # collect the runs of adjacent code points.
    comp_first_ranges, total_first = _number_marked(comp_first, unicode.chars)
    comp_last_ranges, total_last = _number_marked(comp_last, unicode.chars)

    # total_first x total_last matrix, stored row by row; 0 means "no pair".
    comp_data = [0]*(total_first*total_last)
    for first, last, char in comp_pairs:
        comp_data[comp_first[first]*total_last + comp_last[last]] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    # The context manager closes the file even if a later step raises.
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
        print("/* a list of unique database records */", file=fp)
        print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
        for item in table:
            print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
        print("};", file=fp)
        print(file=fp)

        print("/* Reindexing of NFC first characters. */", file=fp)
        print("#define TOTAL_FIRST",total_first, file=fp)
        print("#define TOTAL_LAST",total_last, file=fp)
        print("struct reindex{int start;short count,index;};", file=fp)
        _write_reindex_table(fp, "nfc_first", comp_first_ranges, comp_first)
        _write_reindex_table(fp, "nfc_last", comp_last_ranges, comp_last)

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        print("/* string literals */", file=fp)
        _write_name_table(fp, "const char *_PyUnicode_CategoryNames[] = {",
                          CATEGORY_NAMES)
        _write_name_table(fp, "const char *_PyUnicode_BidirectionalNames[] = {",
                          BIDIRECTIONAL_NAMES)
        _write_name_table(fp, "const char *_PyUnicode_EastAsianWidthNames[] = {",
                          EASTASIANWIDTH_NAMES)
        _write_name_table(fp, "static const char *decomp_prefix[] = {",
                          decomp_prefix)

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        print("/* index tables for the database records */", file=fp)
        print("#define SHIFT", shift, file=fp)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        print("/* decomposition data */", file=fp)
        Array("decomp_data", decomp_data).dump(fp, trace)

        print("/* index tables for the decomposition data */", file=fp)
        print("#define DECOMP_SHIFT", shift, file=fp)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        # split the NFC pair matrix
        index1, index2, shift = splitbins(comp_data, trace)
        print("/* NFC pairs */", file=fp)
        print("#define COMP_SHIFT", shift, file=fp)
        Array("comp_index", index1).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, changes, normalization in unicode.changed:
            cversion = version.replace(".","_")
            # Deduplicate the change records; changes[0] becomes record 0.
            records = [changes[0]]
            seen = {changes[0]: 0}
            change_index = [0] * len(changes)
            for i, record in enumerate(changes):
                slot = seen.get(record)
                if slot is None:
                    slot = seen[record] = len(records)
                    records.append(record)
                change_index[i] = slot
            index1, index2, shift = splitbins(change_index, trace)
            print("static const change_record change_records_%s[] = {" % cversion, file=fp)
            for record in records:
                print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
            print("};", file=fp)
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
            print("{", file=fp)
            print("\tint index;", file=fp)
            print("\tif (n >= 0x110000) index = 0;", file=fp)
            print("\telse {", file=fp)
            print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
            print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
                  (cversion, shift, ((1<<shift)-1)), file=fp)
            print("\t}", file=fp)
            print("\treturn change_records_%s+index;" % cversion, file=fp)
            print("}\n", file=fp)
            print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
            print("{", file=fp)
            print("\tswitch(n) {", file=fp)
            for k, v in normalization:
                print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
            print("\tdefault: return 0;", file=fp)
            print("\t}\n}\n", file=fp)
349
# --------------------------------------------------------------------
# unicode character type tables
352
def _write_case_labels(fp, codepoints, return_line):
    """Write C `case` labels for `codepoints`, then `return_line`.

    The labels are written in ascending order.  Code points from 0x10000
    upwards are wrapped in `#ifdef Py_UNICODE_WIDE`; the matching `#endif`
    goes before the return statement when narrow code points are present as
    well, and after it when every label is wide.
    """
    has_wide = False
    has_narrow = False
    for codepoint in sorted(codepoints):
        if codepoint < 0x10000:
            has_narrow = True
        elif not has_wide:
            print('#ifdef Py_UNICODE_WIDE', file=fp)
            has_wide = True
        print(' case 0x%04X:' % (codepoint,), file=fp)
    if has_wide and has_narrow:
        print('#endif', file=fp)
    print(return_line, file=fp)
    if has_wide and not has_narrow:
        print('#endif', file=fp)


def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h.

    unicode -- the parsed database; this function reads its `chars` and
               `table` attributes.
    trace   -- passed through to splitbins() and Array.dump().

    The header contains the unique (upper, lower, title, decimal, digit,
    flags) records with a two-level index, plus the C functions
    _PyUnicode_ToNumeric(), _PyUnicode_IsWhitespace() and
    _PyUnicode_IsLinebreak() written as switch statements.
    """
    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types

    # Record 0 is the all-zero placeholder.  The cache maps record ->
    # position in `table`, so it is seeded with the placeholder as the key.
    # (It used to be seeded as {0: dummy}; real characters whose record is
    # all zeros -- e.g. a Cc/BN control such as U+0000 -- then got a second,
    # identical record instead of sharing record 0.)
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)
    numeric = {}        # numeric value text -> list of code points
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        # extract database properties
        category = record[2]
        bidirectional = record[4]
        properties = record[16]
        flags = 0
        if category in ("Lm", "Lt", "Lu", "Ll", "Lo"):
            flags |= ALPHA_MASK
        if category == "Ll":
            flags |= LOWER_MASK
        if category == "Zl" or bidirectional == "B":
            flags |= LINEBREAK_MASK
            linebreaks.append(char)
        if category == "Zs" or bidirectional in ("WS", "B", "S"):
            flags |= SPACE_MASK
            spaces.append(char)
        if category == "Lt":
            flags |= TITLE_MASK
        if category == "Lu":
            flags |= UPPER_MASK
        if char == ord(" ") or category[0] not in ("C", "Z"):
            flags |= PRINTABLE_MASK
        if "XID_Start" in properties:
            flags |= XID_START_MASK
        if "XID_Continue" in properties:
            flags |= XID_CONTINUE_MASK
        # use delta predictor for upper/lower/title if it fits
        upper = int(record[12], 16) if record[12] else char
        lower = int(record[13], 16) if record[13] else char
        if record[14]:
            title = int(record[14], 16)
        else:
            # UCD.html says that a missing title char means that
            # it defaults to the uppercase character, not to the
            # character itself. Apparently, in the current UCD (5.x)
            # this feature is never used
            title = upper
        upper_d = upper - char
        lower_d = lower - char
        title_d = title - char
        if -32768 <= upper_d <= 32767 and \
           -32768 <= lower_d <= 32767 and \
           -32768 <= title_d <= 32767:
            # use deltas, stored as unsigned 16-bit values
            upper = upper_d & 0xffff
            lower = lower_d & 0xffff
            title = title_d & 0xffff
        else:
            # keep the absolute code points and say so in the flags
            flags |= NODELTA_MASK
        # decimal digit, integer digit
        decimal = 0
        if record[6]:
            flags |= DECIMAL_MASK
            decimal = int(record[6])
        digit = 0
        if record[7]:
            flags |= DIGIT_MASK
            digit = int(record[7])
        if record[8]:
            flags |= NUMERIC_MASK
            numeric.setdefault(record[8], []).append(char)
        item = (
            upper, lower, title, decimal, digit, flags
            )
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")

    print("--- Writing", FILE, "...")

    # The context manager closes the file even if a later step raises.
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print("/* a list of unique character type descriptors */", file=fp)
        print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
        for item in table:
            print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
        print("};", file=fp)
        print(file=fp)

        # split type index table
        index1, index2, shift = splitbins(index, trace)

        print("/* type indexes */", file=fp)
        print("#define SHIFT", shift, file=fp)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        print('/* Returns the numeric value as double for Unicode characters', file=fp)
        print(' * having this property, -1.0 otherwise.', file=fp)
        print(' */', file=fp)
        print('double _PyUnicode_ToNumeric(Py_UNICODE ch)', file=fp)
        print('{', file=fp)
        print(' switch (ch) {', file=fp)
        for value, codepoints in sorted(numeric.items()):
            # Turn text into float literals ("1/2" becomes "1.0/2.0")
            value = '/'.join(repr(float(part)) for part in value.split('/'))
            _write_case_labels(fp, codepoints,
                               ' return (double) %s;' % (value,))
        print(' }', file=fp)
        print(' return -1.0;', file=fp)
        print('}', file=fp)
        print(file=fp)

        # Generate code for _PyUnicode_IsWhitespace()
        print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
        print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
        print(" */", file=fp)
        print('int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)', file=fp)
        print('{', file=fp)
        print('#ifdef WANT_WCTYPE_FUNCTIONS', file=fp)
        print(' return iswspace(ch);', file=fp)
        print('#else', file=fp)
        print(' switch (ch) {', file=fp)
        _write_case_labels(fp, spaces, ' return 1;')
        print(' }', file=fp)
        print(' return 0;', file=fp)
        print('#endif', file=fp)
        print('}', file=fp)
        print(file=fp)

        # Generate code for _PyUnicode_IsLinebreak()
        print("/* Returns 1 for Unicode characters having the category 'Zl',", file=fp)
        print(" * 'Zp' or type 'B', 0 otherwise.", file=fp)
        print(" */", file=fp)
        print('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)', file=fp)
        print('{', file=fp)
        print(' switch (ch) {', file=fp)
        _write_case_labels(fp, linebreaks, ' return 1;')
        print(' }', file=fp)
        print(' return 0;', file=fp)
        print('}', file=fp)
        print(file=fp)
567
# --------------------------------------------------------------------
# unicode name database
570
571def makeunicodename(unicode, trace):
572
573 FILE = "Modules/unicodename_db.h"
574
Collin Winter6afaeb72007-08-03 17:06:41 +0000575 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000576
577 # collect names
578 names = [None] * len(unicode.chars)
579
580 for char in unicode.chars:
581 record = unicode.table[char]
582 if record:
583 name = record[1].strip()
584 if name and name[0] != "<":
585 names[char] = name + chr(0)
586
Georg Brandl559e5d72008-06-11 18:37:52 +0000587 print(len(list(n for n in names if n is not None)), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000588
589 # collect unique words from names (note that we differ between
590 # words inside a sentence, and words ending a sentence. the
591 # latter includes the trailing null byte.
592
593 words = {}
594 n = b = 0
595 for char in unicode.chars:
596 name = names[char]
597 if name:
598 w = name.split()
599 b = b + len(name)
600 n = n + len(w)
601 for w in w:
602 l = words.get(w)
603 if l:
604 l.append(None)
605 else:
606 words[w] = [len(words)]
607
Collin Winter6afaeb72007-08-03 17:06:41 +0000608 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000609
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000610 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000611
Martin v. Löwis97225da2002-11-24 23:05:09 +0000612 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000613 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000614 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000615 return -len(alist), aword
616 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000617
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000618 # figure out how many phrasebook escapes we need
619 escapes = 0
620 while escapes * 256 < len(wordlist):
621 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000622 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000623
624 short = 256 - escapes
625
626 assert short > 0
627
Collin Winter6afaeb72007-08-03 17:06:41 +0000628 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000629
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000630 # statistics
631 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000632 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000633 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000634 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000635
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000636 # pick the most commonly used words, and sort the rest on falling
637 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000638
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000639 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000640 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000641 wordlist.extend(wordtail)
642
643 # generate lexicon from words
644
645 lexicon_offset = [0]
646 lexicon = ""
647 words = {}
648
649 # build a lexicon string
650 offset = 0
651 for w, x in wordlist:
652 # encoding: bit 7 indicates last character in word (chr(128)
653 # indicates the last character in an entire string)
654 ww = w[:-1] + chr(ord(w[-1])+128)
655 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000656 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000657 if o < 0:
658 o = offset
659 lexicon = lexicon + ww
660 offset = offset + len(w)
661 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000662 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000663
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000664 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000665
666 # generate phrasebook from names and lexicon
667 phrasebook = [0]
668 phrasebook_offset = [0] * len(unicode.chars)
669 for char in unicode.chars:
670 name = names[char]
671 if name:
672 w = name.split()
673 phrasebook_offset[char] = len(phrasebook)
674 for w in w:
675 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000676 if i < short:
677 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000678 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000679 # store as two bytes
680 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000681 phrasebook.append(i&255)
682
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000683 assert getsize(phrasebook) == 1
684
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000685 #
686 # unicode name hash table
687
688 # extract names
689 data = []
690 for char in unicode.chars:
691 record = unicode.table[char]
692 if record:
693 name = record[1].strip()
694 if name and name[0] != "<":
695 data.append((name, char))
696
697 # the magic number 47 was chosen to minimize the number of
698 # collisions on the current data set. if you like, change it
699 # and see what happens...
700
701 codehash = Hash("code", data, 47)
702
Collin Winter6afaeb72007-08-03 17:06:41 +0000703 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000704
705 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000706 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
707 print(file=fp)
708 print("#define NAME_MAXLEN", 256, file=fp)
709 print(file=fp)
710 print("/* lexicon */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000711 Array("lexicon", lexicon).dump(fp, trace)
712 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000713
714 # split decomposition index table
715 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
716
Collin Winter6afaeb72007-08-03 17:06:41 +0000717 print("/* code->name phrasebook */", file=fp)
718 print("#define phrasebook_shift", shift, file=fp)
719 print("#define phrasebook_short", short, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000720
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000721 Array("phrasebook", phrasebook).dump(fp, trace)
722 Array("phrasebook_offset1", offset1).dump(fp, trace)
723 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000724
Collin Winter6afaeb72007-08-03 17:06:41 +0000725 print("/* name->code dictionary */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000726 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000727
728 fp.close()
729
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000730
def merge_old_version(version, new, old):
    """Record on `new` how the `old` database differs from it.

    Appends one tuple (version, records, normalization_changes) to
    new.changed, where records holds, for each of the 0x110000 code
    points, a tuple (bidir, category, decimal, mirrored, numeric) giving
    the *old* value of every field that differs, and
    normalization_changes is a list of (code point, old decomposition)
    pairs.

    Raises NotImplementedError if the two exclusion tables differ, and a
    local Difference exception if a field changed that is neither
    handled nor explicitly ignored below.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    class Difference(Exception):
        pass

    count = 0x110000
    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF] * count
    category_changes = [0xFF] * count
    decimal_changes = [0xFF] * count
    mirrored_changes = [0xFF] * count
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * count
    # normalization_changes is a list of key-value pairs
    normalization_changes = []

    # fields whose changes are deliberately not recorded: ISO comment
    # (11), simple upper/lower/title case mappings (12-14) and derived
    # properties (16; not yet)
    ignored_fields = (11, 12, 13, 14, 16)

    old_table = old.table
    new_table = new.table
    for i in range(count):
        new_record = new_table[i]
        if new_record is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old_table[i] is None
            continue
        old_record = old_table[i]
        if old_record is None:
            # unassigned in the old version; category 0 is "unassigned"
            category_changes[i] = 0
            continue
        if old_record == new_record:
            continue

        # walk the fields and note the old value of each one that differs
        for k, value in enumerate(old_record):
            if value == new_record[k]:
                continue
            if k == 2:
                category_changes[i] = CATEGORY_NAMES.index(value)
            elif k == 4:
                bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
            elif k == 5:
                # We assume that all normalization changes are in 1:1 mappings
                assert " " not in value
                normalization_changes.append((i, value))
            elif k == 6:
                # we only support changes where the old value is a single digit
                assert value in "0123456789"
                decimal_changes[i] = int(value)
            elif k == 8:
                # Since 0 encodes "no change", the old value is better not 0
                if value:
                    numeric_changes[i] = float(value)
                    assert numeric_changes[i] not in (0, -1)
                else:
                    numeric_changes[i] = -1
            elif k == 9:
                # NOTE(review): stored as the strings '1'/'0' while the
                # "no change" marker is the int 0xFF -- confirm that the
                # code writing these records expects that mix
                mirrored_changes[i] = '1' if value == 'Y' else '0'
            elif k not in ignored_fields:
                raise Difference(hex(i), k, old.table[i], new.table[i])

    records = list(zip(bidir_changes, category_changes, decimal_changes,
                       mirrored_changes, numeric_changes))
    new.changed.append((version, records, normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000813
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000814
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000815# --------------------------------------------------------------------
816# the following support code is taken from the unidb utilities
817# Copyright (c) 1999-2000 by Secret Labs AB
818
819# load a unicode-data file from disk
820
class UnicodeData:
    """Unicode character database loaded from a set of data files.

    Public attributes set by the constructor:
      filename    the path passed as `filename`
      table       list of 0x110000 entries; None for code points without
                  a record, otherwise a list of fields (see below)
      chars       the code points to process (all of them by default,
                  see uselatin1)
      exclusions  dict mapping each code point listed in the exclusions
                  file to 1
      changed     empty list, filled in later by merge_old_version
    """
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)
    # When derivednormalizationprops is given, one more field holding the
    # packed quick-check bits is appended after derived-props.

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivedprops, derivednormalizationprops=None, expand=1):
        """Read all data files and build the per-code-point table.

        filename is the main ';'-separated data file; exclusions,
        eastasianwidth, derivedprops, derivednormalizationprops and
        unihan are paths of the auxiliary files.  If expand is true,
        "<..., First>" / "<..., Last>" pairs are expanded into one
        record per code point, with an empty name.

        Every file is opened in a `with` block so that its handle is
        closed as soon as it has been read.
        """
        self.changed = []
        table = [None] * 0x110000
        with open(filename) as file:
            for s in file:
                s = s.strip().split(";")
                char = int(s[0], 16)
                table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    # inside a First..Last range: clone the First record
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = list(range(0x110000))

        self.exclusions = {}
        with open(exclusions) as file:
            for s in file:
                s = s.strip()
                if not s or s[0] == '#':
                    continue
                char = int(s.split()[0], 16)
                self.exclusions[char] = 1

        widths = [None] * 0x110000
        with open(eastasianwidth) as file:
            for s in file:
                s = s.strip()
                if not s or s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                first, last = self._parse_range(s[0])
                for char in range(first, last + 1):
                    widths[char] = s[1]

        # append the ea-width field, then an (initially empty) set of
        # derived properties, to every existing record
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])
                table[i].append(set())

        with open(derivedprops) as file:
            for s in file:
                s = s.split('#', 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                first, last = self._parse_range(r)
                for char in range(first, last + 1):
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        if derivednormalizationprops:
            quickchecks = [0] * 0x110000  # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            with open(derivednormalizationprops) as file:
                for s in file:
                    s = s.split('#', 1)[0]
                    s = [i.strip() for i in s.split(';')]
                    if len(s) < 2 or s[1] not in qc_order:
                        continue
                    # two bits per normalization form: 1 = Maybe, 2 = No
                    quickcheck = 'MN'.index(s[2]) + 1
                    quickcheck_shift = qc_order.index(s[1]) * 2
                    quickcheck <<= quickcheck_shift
                    first, last = self._parse_range(s[0])
                    for char in range(first, last + 1):
                        assert not (quickchecks[char] >> quickcheck_shift) & 3
                        quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        with open(unihan, encoding='utf-8') as file:
            for line in file:
                if not line.startswith('U+'):
                    continue
                code, tag, value = line.split(None, 3)[:3]
                if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                               'kOtherNumeric'):
                    continue
                value = value.strip().replace(',', '')
                i = int(code[2:], 16)
                # Patch the numeric field
                if table[i] is not None:
                    table[i][8] = value

    @staticmethod
    def _parse_range(spec):
        # "XXXX" -> (x, x); "XXXX..YYYY" -> (x, y); both bounds are hex
        if '..' in spec:
            first, last = [int(c, 16) for c in spec.split('..')]
        else:
            first = last = int(spec, 16)
        return first, last

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000954
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000955# hash table tools
956
957# this is a straight-forward reimplementation of Python's built-in
958# dictionary type, using a static data structure, and a custom string
959# hash algorithm.
960
def myhash(s, magic):
    """Return the case-insensitive string hash of `s` for multiplier `magic`.

    The string is upper-cased first.  For every character the running
    value is multiplied by `magic` and the character's code is added;
    whenever bits 24-31 become non-zero they are XORed into the low
    byte and the value is cut back to 24 bits.
    """
    acc = 0
    for ch in s.upper():
        acc = acc * magic + ord(ch)
        high = acc & 0xff000000
        if high:
            acc = (acc ^ ((high >> 24) & 0xff)) & 0x00ffffff
    return acc
969
# (size, poly) pairs, in increasing size order.  Hash.__init__ takes the
# first pair whose size is larger than the number of items to store and
# uses size + poly as the value XORed into the probe increment once that
# increment grows beyond the table mask.
SIZES = [
    (4, 3),
    (8, 3),
    (16, 3),
    (32, 5),
    (64, 3),
    (128, 3),
    (256, 29),
    (512, 17),
    (1024, 9),
    (2048, 5),
    (4096, 83),
    (8192, 27),
    (16384, 43),
    (32768, 3),
    (65536, 45),
    (131072, 9),
    (262144, 39),
    (524288, 39),
    (1048576, 9),
    (2097152, 5),
    (4194304, 3),
    (8388608, 33),
    (16777216, 27),
]
976
class Hash:
    """Static hash table built from (key, value) pairs, dumpable as C data.

    Keys are hashed with myhash(key, magic).  Only the values are stored;
    slots left unused hold 0.  After construction the instance exposes
    collisions, data (an Array named name + "_hash"), magic, name, size
    and poly.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first entry strictly larger than the
        # number of items
        wanted = len(data)
        for size, poly in SIZES:
            if size > wanted:
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynominals")

        print(size, "slots in hash table")

        mask = size - 1
        slots = [None] * size
        collisions = 0

        # initialize hash table
        for key, value in data:
            h = myhash(key, magic)
            pos = ~h & mask
            if slots[pos] is None:
                slots[pos] = value
                continue
            # first choice is taken: probe further slots, stepping by an
            # increment derived from the hash
            step = (h ^ (h >> 3)) & mask
            if not step:
                step = mask
            while True:
                collisions += 1
                pos = (pos + step) & mask
                if slots[pos] is None:
                    slots[pos] = value
                    break
                step <<= 1
                if step > mask:
                    step ^= poly

        print(collisions, "collisions")
        self.collisions = collisions

        # unused slots are written out as 0
        filled = [0 if entry is None else entry for entry in slots]

        self.data = Array(name + "_hash", filled)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array followed by the three
        # #defines describing the table
        self.data.dump(file, trace)
        prefix = self.name
        file.write("#define %s_magic %d\n" % (prefix, self.magic))
        file.write("#define %s_size %d\n" % (prefix, self.size))
        file.write("#define %s_poly %d\n" % (prefix, self.poly))
1040
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001041# stuff to deal with arrays of unsigned integers
1042
class Array:
    """A named sequence of unsigned integers that can be written as a C array."""

    def __init__(self, name, data):
        self.data = data
        self.name = name

    def dump(self, file, trace=0):
        # write data to file, as a C array whose element type is the
        # narrowest one reported by getsize
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)

        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")

        if self.data:
            indent = " "
            line = indent
            for item in self.data:
                cell = str(item) + ", "
                if len(line) + len(cell) > 78:
                    # current line is full: flush it and start a new one
                    file.write(line + "\n")
                    line = indent + cell
                else:
                    line += cell
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1074
def getsize(data):
    """Return the smallest integer size, in bytes, for the given array.

    The result is 1 if every item is below 256, 2 if every item is below
    65536, and 4 otherwise.  An empty sequence yields 1 instead of
    letting max() raise ValueError, so callers such as Array.dump (which
    explicitly handles empty data) can size an empty table.
    """
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
1084
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  The result is a two-level lookup: t1 and
    t2 are lists of ints and shift is an int such that, for each i in
    range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.  Every
    shift from 0 up to the largest one that still leaves something of
    the last valid index is tried, and the one with the smallest
    combined size of t1 and t2 (as sized by getsize) wins; on a tie the
    smaller shift is kept.

    If optional arg trace is non-zero (default zero), progress info is
    printed to sys.stderr; a value above 1 also reports every candidate
    shift.
    """

    if trace:
        def report(index, bins, shift, nbytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(index), len(bins), shift, nbytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t),
              "bytes", file=sys.stderr)

    # the most we can shift the last valid index and still have
    # something left
    last = len(t) - 1
    maxshift = last.bit_length() - 1 if last > 0 else 0

    smallest = sys.maxsize  # smallest total size so far
    t = tuple(t)  # so slices can be dict keys
    for shift in range(maxshift + 1):
        width = 1 << shift
        index = []
        bins = []
        seen = {}  # bin contents -> offset of that bin in `bins`
        for start in range(0, len(t), width):
            chunk = t[start:start + width]
            offset = seen.get(chunk)
            if offset is None:
                offset = len(bins)
                seen[chunk] = offset
                bins.extend(chunk)
            index.append(offset >> shift)
        # determine memory size
        total = len(index)*getsize(index) + len(bins)*getsize(bins)
        if trace > 1:
            report(index, bins, shift, total)
        if total < smallest:
            best = index, bins, shift
            smallest = total

    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        report(t1, t2, shift, smallest)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = (1 << shift) - 1  # low-bit mask of shift bits
        for i, expected in enumerate(t):
            assert expected == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001145
if __name__ == "__main__":
    # run as a script: call maketables, which is not shown in this part
    # of the file (the argument 1 is presumably its trace flag -- TODO
    # confirm against the maketables definition)
    maketables(1)