#
# (re)generate unicode property and type databases
#
# this script converts a unicode database file (see UNIDATA_VERSION) to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl expand first/last ranges
# 2001-01-19 fl added character name tables (2.1)
# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd use string methods
# 2002-10-18 mvl update to Unicode 3.2
# 2002-10-22 mvl generate NFC tables
# 2002-11-24 mvl expand all ranges, sort names version-independently
# 2002-11-25 mvl add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys, os, zipfile

SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
UNIDATA_VERSION = "6.0.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NODELTA_MASK = 0x800
NUMERIC_MASK = 0x1000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FCB'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D')
]

def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

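    # Layout of decomp_data (a sketch, derived from the packing code
    # below): each decomposition is stored as one header word followed by
    # the decomposed code points.  The header's low byte is an index into
    # decomp_prefix (0 for canonical decompositions) and its high byte is
    # the number of code points, i.e. prefix + (len(decomp) << 8).  For
    # example, a two-code-point compatibility decomposition using prefix
    # slot 3 would start with the header 3 + (2 << 8) = 0x0203
    # (illustrative values only).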
    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

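    # Renumber the NFC "first" and "last" characters densely (a sketch of
    # the step below): f and l count, in code point order, the characters
    # that can appear as the first or second element of a composing pair.
    # The original code points are remembered as (start, end) ranges so
    # that the reader of the generated tables can map a code point back to
    # its dense index, and the dense indexes address the
    # total_first x total_last matrix comp_data, looked up as
    # comp_data[f*total_last + l].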
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
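            # Sketch based on the masking above: when deltas are used,
            # upper/lower/title hold the 16-bit two's-complement difference
            # from the code point itself (e.g. a delta of -32 is stored as
            # 0xffe0); NODELTA_MASK tells the reader of the generated table
            # that the fields are absolute code points instead.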
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split type record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)
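        # e.g. an entry of '1/3' becomes '1.0/3.0', so the generated C code
        # evaluates the fraction in double precision (illustrative example;
        # the values themselves come straight from the Unihan/UCD data).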

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))
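    # lexicon is now a flat byte table; lexicon_offset[i] is the byte
    # offset of word number i inside it.  Because only the final character
    # of each word has bit 7 set, a word that is a suffix of an already
    # stored word can simply point into the longer word's tail (the reuse
    # attempted by lexicon.find() above).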

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                        normalization_changes))

def open_data(template, version):
    local = template % ('-'+version,)
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = 'http://www.unicode.org/Public/3.2-Update/' + local
        else:
            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    else:
        # Unihan.zip
        return open(local, 'rb')

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, version,
                 linebreakprops=False,
                 expand=1,
                 cjk_check=True):
        self.changed = []
        table = [None] * 0x110000
        with open_data(UNICODE_DATA, version) as file:
            while 1:
                s = file.readline()
                if not s:
                    break
                s = s.strip().split(";")
                char = int(s[0], 16)
                table[char] = s

        cjk_ranges_found = []

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        if s[1].startswith("<CJK Ideograph"):
                            cjk_ranges_found.append((field[0],
                                                     s[0]))
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        self.exclusions = {}
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                char = int(s.split()[0],16)
                self.exclusions[char] = 1

        widths = [None] * 0x110000
        with open_data(EASTASIAN_WIDTH, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                if '..' in s[0]:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())

        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
            for s in file:
                s = s.split('#', 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                if ".." in r:
                    first, last = [int(c, 16) for c in r.split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(r, 16)]
                for char in chars:
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        with open_data(LINE_BREAK, version) as file:
            for s in file:
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
            for s in file:
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])

        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

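# Each entry below pairs a power-of-two table size with the polynomial
# used to perturb the probe increment when that size is chosen (see the
# collision loop in Hash.__init__, where incr is xor'ed with poly once it
# overflows the mask).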
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), \
              "bytes", file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)