#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys, os, zipfile

SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
UNIDATA_VERSION = "6.0.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NODELTA_MASK = 0x800
NUMERIC_MASK = 0x1000
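
# Illustrative example (not part of the original script): an uppercase ASCII
# letter such as "A" ends up with
# ALPHA_MASK | UPPER_MASK | XID_START_MASK | XID_CONTINUE_MASK | PRINTABLE_MASK
# (0x781) set in its character type record built by makeunicodetype() below.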

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FCB'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D')
]

def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char
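
    # Illustrative note (not part of the original script): comp_data is a
    # flattened two-dimensional table, so the NFC pair whose first character
    # has reindexed value f and whose last character has reindexed value l is
    # stored at comp_data[f*total_last + l]; e.g. if total_last were 55, the
    # pair (f=3, l=7) would land at index 3*55 + 7 == 172.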

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
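            # Worked example (illustrative, not part of the original script):
            # for U+0061 the uppercase mapping is U+0041, so upper_d == -32
            # and the stored 16-bit value is -32 & 0xffff == 0xFFE0; only
            # mappings farther away than +/-32767 fall back to absolute code
            # points via NODELTA_MASK.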
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence.  the
    # latter includes the trailing null byte.

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)
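
    # Illustrative note (not part of the original script): because the last
    # character of each encoded word has bit 7 set, lexicon.find(ww) can only
    # match at a point where a stored word ends, so e.g. a word "ROW" may
    # reuse the tail of an already stored "ARROW" instead of being appended
    # again.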

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1
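
    # Decoding sketch (illustrative, not part of the original script): a
    # phrasebook entry below `short` is a one-byte lexicon word index; larger
    # indexes i are stored as the two bytes (i>>8)+short and i&255, so if
    # short were 250, i == 600 would be stored as 252 followed by 88.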

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                        normalization_changes))

def open_data(template, version):
    local = template % ('-'+version,)
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = 'http://www.unicode.org/Public/3.2-Update/' + local
        else:
            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    else:
        # Unihan.zip
        return open(local, 'rb')

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)
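    #
    # For instance (illustrative, not part of the original script): for
    # U+0041 the parsed record has record[1] == "LATIN CAPITAL LETTER A",
    # record[2] == "Lu" and record[13] == "0061" (the simple lowercase
    # mapping); the east asian width, derived-properties set and quickcheck
    # fields are appended by __init__ below.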

    def __init__(self, version,
                 linebreakprops=False,
                 expand=1,
                 cjk_check=True):
        self.changed = []
        file = open_data(UNICODE_DATA, version)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        cjk_ranges_found = []

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        if s[1].startswith("<CJK Ideograph"):
                            cjk_ranges_found.append((field[0],
                                                     s[0]))
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        file = open_data(COMPOSITION_EXCLUSIONS, version)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open_data(EASTASIAN_WIDTH, version):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        for s in open_data(DERIVED_CORE_PROPERTIES, version):
            s = s.split('#', 1)[0].strip()
            if not s:
                continue

            r, p = s.split(";")
            r = r.strip()
            p = p.strip()
            if ".." in r:
                first, last = [int(c, 16) for c in r.split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(r, 16)]
            for char in chars:
                if table[char]:
                    # Some properties (e.g. Default_Ignorable_Code_Point)
                    # apply to unassigned code points; ignore them
                    table[char][-1].add(p)

        for s in open_data(LINE_BREAK, version):
            s = s.partition('#')[0]
            s = [i.strip() for i in s.split(';')]
            if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                continue
            if '..' not in s[0]:
                first = last = int(s[0], 16)
            else:
                first, last = [int(c, 16) for c in s[0].split('..')]
            for char in range(first, last+1):
                table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        for s in open_data(DERIVEDNORMALIZATION_PROPS, version):
            if '#' in s:
                s = s[:s.index('#')]
            s = [i.strip() for i in s.split(';')]
            if len(s) < 2 or s[1] not in qc_order:
                continue
            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
            quickcheck_shift = qc_order.index(s[1])*2
            quickcheck <<= quickcheck_shift
            if '..' not in s[0]:
                first = last = int(s[0], 16)
            else:
                first, last = [int(c, 16) for c in s[0].split('..')]
            for char in range(first, last+1):
                assert not (quickchecks[char]>>quickcheck_shift)&3
                quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])

        zip = zipfile.ZipFile(open_data(UNIHAN, version))
        if version == '3.2.0':
            data = zip.open('Unihan-3.2.0.txt').read()
        else:
            data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.
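#
# As a quick illustration (not part of the original script): myhash("A", 47)
# folds the single byte 65 into h = 0*47 + 65 == 65; no bits above 0x00ffffff
# are set, so the hash of "A" is simply 65.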

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
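
    # Worked example (illustrative, not part of the original script):
    # splitbins([1, 2, 1, 2, 1, 2, 1, 2]) returns t1 == [0, 0, 0, 0],
    # t2 == [1, 2] and shift == 1; t[5] is then recovered as
    # t2[(t1[5 >> 1] << 1) + (5 & 1)] == t2[1] == 2.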

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), \
              "bytes", file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)