#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl    created (based on bits and pieces from unidb)
# 2000-09-25 fl    merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl    added character type table
# 2000-09-26 fl    added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl    expand first/last ranges
# 2001-01-19 fl    added character name tables (2.1)
# 2001-01-21 fl    added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd    use string methods
# 2002-10-18 mvl   update to Unicode 3.2
# 2002-10-22 mvl   generate NFC tables
# 2002-11-24 mvl   expand all ranges, sort names version-independently
# 2002-11-25 mvl   add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl   update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb    add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys, os, zipfile

SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
UNIDATA_VERSION = "6.0.0"
# File-name templates; "%s" receives "" for the current version or
# "-<version>" for an older UCD snapshot (see maketables below).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

# Older UCD releases for which delta tables are generated.
old_versions = ["3.2.0"]

# General-category abbreviations; table index doubles as the numeric
# category code written into the generated C tables.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes that force a mandatory break.
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NODELTA_MASK = 0x800
NUMERIC_MASK = 0x1000
def maketables(trace=0):
    """Regenerate all generated headers from the current Unicode database.

    Reads the current UCD (UNIDATA_VERSION), merges in the deltas from
    each release listed in ``old_versions``, then writes the name, data
    and type header files.
    """

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    ucd = UnicodeData(UNIDATA_VERSION)
    # count assigned code points (non-None table entries)
    print(sum(map(bool, ucd.table)), "characters")

    # fold in the older UCD snapshots so version deltas can be emitted
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + version), "...")
        older = UnicodeData(version)
        print(sum(map(bool, older.table)), "characters")
        merge_old_version(version, ucd, older)

    makeunicodename(ucd, trace)
    makeunicodedata(ucd, trace)
    makeunicodetype(ucd, trace)
92# --------------------------------------------------------------------
93# unicode character properties
94
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h.

    Emits: the deduplicated database-record table, the NFC first/last
    reindexing tables, the packed decomposition data, split (two-level)
    index tables for records and decompositions, and per-version delta
    tables for the releases in ``unicode.changed``.

    ``unicode`` is a UnicodeData instance; ``trace`` is passed through
    to splitbins()/Array.dump() for diagnostics.
    """

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}          # record tuple -> index into `table`
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables (deduplicated via `cache`)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]           # flat array: [prefix|len<<8, codepoints...]
    decomp_prefix = [""]        # compatibility tags, e.g. "<compat>"
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []             # (first, last, composed) NFC triples
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # hard limit imposed by the C consumer (nfd_nfkd)
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs prefix index (low byte) and
                # decomposition length (high byte)
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions not excluded and whose first character
                # has combining class 0
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    # NOTE(review): decomp_data holds ints while `decomp` is
                    # a list, so this lookup appears to always raise
                    # ValueError (no sharing) -- confirm before changing.
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Compress the sparse comp_first/comp_last marker arrays into dense
    # sequential indexes plus contiguous (start, end) ranges.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense 2-D composition matrix, indexed [first*total_last + last].
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print(" {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print(" {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print(" \"%s\"," % name, file=fp)
    print(" NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print(" \"%s\"," % name, file=fp)
    print(" NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print(" \"%s\"," % name, file=fp)
    print(" NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print(" \"%s\"," % name, file=fp)
    print(" NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        # C accessor mapping a code point to its change record
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        # C switch table for version-specific normalization changes
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()
344# --------------------------------------------------------------------
345# unicode character type tables
346
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h.

    Builds the deduplicated character-type record table (case mappings,
    decimal/digit values and flag bits), its split index tables, and the
    generated C bodies of _PyUnicode_ToNumeric(), _PyUnicode_IsWhitespace()
    and _PyUnicode_IsLinebreak().
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}          # record tuple -> index into `table`
    index = [0] * len(unicode.chars)
    numeric = {}                # Numeric_Value string -> [code points]
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            # NOTE(review): `delta` is assigned but never read below --
            # appears to be a leftover; confirm before removing.
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # printable: space itself, or anything not Control/Separator
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas (stored as unsigned 16-bit two's complement)
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables (deduplicated via `cache`)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print(' switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals (fractions like "1/4" become
        # "0.25"-style numerator/denominator literals)
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print(' case 0x%04X:' % (codepoint,), file=fp)
        print(' return (double) %s;' % (value,), file=fp)
    print(' }', file=fp)
    print(' return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print(' switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print(' case 0x%04X:' % (codepoint,), file=fp)
    print(' return 1;', file=fp)

    print(' }', file=fp)
    print(' return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print(' switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print(' case 0x%04X:' % (codepoint,), file=fp)
    print(' return 1;', file=fp)

    print(' }', file=fp)
    print(' return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
525
526# --------------------------------------------------------------------
527# unicode name database
528
529def makeunicodename(unicode, trace):
530
531 FILE = "Modules/unicodename_db.h"
532
Collin Winter6afaeb72007-08-03 17:06:41 +0000533 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000534
535 # collect names
536 names = [None] * len(unicode.chars)
537
538 for char in unicode.chars:
539 record = unicode.table[char]
540 if record:
541 name = record[1].strip()
542 if name and name[0] != "<":
543 names[char] = name + chr(0)
544
Georg Brandl559e5d72008-06-11 18:37:52 +0000545 print(len(list(n for n in names if n is not None)), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000546
547 # collect unique words from names (note that we differ between
548 # words inside a sentence, and words ending a sentence. the
549 # latter includes the trailing null byte.
550
551 words = {}
552 n = b = 0
553 for char in unicode.chars:
554 name = names[char]
555 if name:
556 w = name.split()
557 b = b + len(name)
558 n = n + len(w)
559 for w in w:
560 l = words.get(w)
561 if l:
562 l.append(None)
563 else:
564 words[w] = [len(words)]
565
Collin Winter6afaeb72007-08-03 17:06:41 +0000566 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000567
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000568 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000569
Martin v. Löwis97225da2002-11-24 23:05:09 +0000570 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000571 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000572 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000573 return -len(alist), aword
574 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000575
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000576 # figure out how many phrasebook escapes we need
577 escapes = 0
578 while escapes * 256 < len(wordlist):
579 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000580 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000581
582 short = 256 - escapes
583
584 assert short > 0
585
Collin Winter6afaeb72007-08-03 17:06:41 +0000586 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000587
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000588 # statistics
589 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000590 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000591 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000592 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000593
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000594 # pick the most commonly used words, and sort the rest on falling
595 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000596
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000597 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000598 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000599 wordlist.extend(wordtail)
600
601 # generate lexicon from words
602
603 lexicon_offset = [0]
604 lexicon = ""
605 words = {}
606
607 # build a lexicon string
608 offset = 0
609 for w, x in wordlist:
610 # encoding: bit 7 indicates last character in word (chr(128)
611 # indicates the last character in an entire string)
612 ww = w[:-1] + chr(ord(w[-1])+128)
613 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000614 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000615 if o < 0:
616 o = offset
617 lexicon = lexicon + ww
618 offset = offset + len(w)
619 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000620 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000621
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000622 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000623
624 # generate phrasebook from names and lexicon
625 phrasebook = [0]
626 phrasebook_offset = [0] * len(unicode.chars)
627 for char in unicode.chars:
628 name = names[char]
629 if name:
630 w = name.split()
631 phrasebook_offset[char] = len(phrasebook)
632 for w in w:
633 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000634 if i < short:
635 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000636 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000637 # store as two bytes
638 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000639 phrasebook.append(i&255)
640
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000641 assert getsize(phrasebook) == 1
642
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000643 #
644 # unicode name hash table
645
646 # extract names
647 data = []
648 for char in unicode.chars:
649 record = unicode.table[char]
650 if record:
651 name = record[1].strip()
652 if name and name[0] != "<":
653 data.append((name, char))
654
655 # the magic number 47 was chosen to minimize the number of
656 # collisions on the current data set. if you like, change it
657 # and see what happens...
658
659 codehash = Hash("code", data, 47)
660
Collin Winter6afaeb72007-08-03 17:06:41 +0000661 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000662
663 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000664 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
665 print(file=fp)
666 print("#define NAME_MAXLEN", 256, file=fp)
667 print(file=fp)
668 print("/* lexicon */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000669 Array("lexicon", lexicon).dump(fp, trace)
670 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000671
672 # split decomposition index table
673 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
674
Collin Winter6afaeb72007-08-03 17:06:41 +0000675 print("/* code->name phrasebook */", file=fp)
676 print("#define phrasebook_shift", shift, file=fp)
677 print("#define phrasebook_short", short, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000678
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000679 Array("phrasebook", phrasebook).dump(fp, trace)
680 Array("phrasebook_offset1", offset1).dump(fp, trace)
681 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000682
Collin Winter6afaeb72007-08-03 17:06:41 +0000683 print("/* name->code dictionary */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000684 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000685
686 fp.close()
687
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000688
def merge_old_version(version, new, old):
    """Compute per-codepoint deltas between an old UCD release and the new one.

    Appends (version, changes, normalization_changes) to new.changed, where
    changes is a list, indexed by codepoint, of
    (bidir, category, decimal, mirrored, numeric) change records, and
    normalization_changes is a list of (codepoint, old-decomposition) pairs.
    Raises NotImplementedError if the composition exclusion sets differ,
    and a local Difference exception for any field change this function
    does not know how to encode.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            # k indexes the semicolon-separated UnicodeData record fields
            # (see the UnicodeData class's record-structure comment).
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi-mirrored flag; stored as the character '0'/'1'
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000775
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000776def open_data(template, version):
777 local = template % ('-'+version,)
778 if not os.path.exists(local):
779 import urllib.request
780 if version == '3.2.0':
781 # irregular url structure
782 url = 'http://www.unicode.org/Public/3.2-Update/' + local
783 else:
784 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
785 urllib.request.urlretrieve(url, filename=local)
786 if local.endswith('.txt'):
787 return open(local, encoding='utf-8')
788 else:
789 # Unihan.zip
790 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000791
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000792# --------------------------------------------------------------------
793# the following support code is taken from the unidb utilities
794# Copyright (c) 1999-2000 by Secret Labs AB
795
796# load a unicode-data file from disk
797
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000798class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000799 # Record structure:
800 # [ID, name, category, combining, bidi, decomp, (6)
801 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
802 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
803 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000804
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000805 def __init__(self, version,
806 linebreakprops=False,
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000807 expand=1):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000808 self.changed = []
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000809 file = open_data(UNICODE_DATA, version)
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000810 table = [None] * 0x110000
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000811 while 1:
812 s = file.readline()
813 if not s:
814 break
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000815 s = s.strip().split(";")
816 char = int(s[0], 16)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000817 table[char] = s
818
Martin v. Löwis97225da2002-11-24 23:05:09 +0000819 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000820 if expand:
821 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000822 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000823 s = table[i]
824 if s:
825 if s[1][-6:] == "First>":
826 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000827 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000828 elif s[1][-5:] == "Last>":
829 s[1] = ""
830 field = None
831 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000832 f2 = field[:]
833 f2[0] = "%X" % i
834 table[i] = f2
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000835
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000836 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000837 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000838 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000839 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000840
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000841 file = open_data(COMPOSITION_EXCLUSIONS, version)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000842 self.exclusions = {}
843 for s in file:
844 s = s.strip()
845 if not s:
846 continue
847 if s[0] == '#':
848 continue
849 char = int(s.split()[0],16)
850 self.exclusions[char] = 1
851
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000852 widths = [None] * 0x110000
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000853 for s in open_data(EASTASIAN_WIDTH, version):
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000854 s = s.strip()
855 if not s:
856 continue
857 if s[0] == '#':
858 continue
859 s = s.split()[0].split(';')
860 if '..' in s[0]:
861 first, last = [int(c, 16) for c in s[0].split('..')]
Georg Brandlbf82e372008-05-16 17:02:34 +0000862 chars = list(range(first, last+1))
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000863 else:
864 chars = [int(s[0], 16)]
865 for char in chars:
866 widths[char] = s[1]
867 for i in range(0, 0x110000):
868 if table[i] is not None:
869 table[i].append(widths[i])
870
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000871 for i in range(0, 0x110000):
872 if table[i] is not None:
873 table[i].append(set())
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000874 for s in open_data(DERIVED_CORE_PROPERTIES, version):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000875 s = s.split('#', 1)[0].strip()
876 if not s:
877 continue
878
879 r, p = s.split(";")
880 r = r.strip()
881 p = p.strip()
882 if ".." in r:
883 first, last = [int(c, 16) for c in r.split('..')]
Georg Brandlbf82e372008-05-16 17:02:34 +0000884 chars = list(range(first, last+1))
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000885 else:
886 chars = [int(r, 16)]
887 for char in chars:
888 if table[char]:
889 # Some properties (e.g. Default_Ignorable_Code_Point)
890 # apply to unassigned code points; ignore them
891 table[char][-1].add(p)
892
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000893 for s in open_data(LINE_BREAK, version):
894 s = s.partition('#')[0]
895 s = [i.strip() for i in s.split(';')]
896 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
897 continue
898 if '..' not in s[0]:
899 first = last = int(s[0], 16)
900 else:
901 first, last = [int(c, 16) for c in s[0].split('..')]
902 for char in range(first, last+1):
903 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000904
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000905 # We only want the quickcheck properties
906 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
907 # Yes is the default, hence only N and M occur
908 # In 3.2.0, the format was different (NF?_NO)
909 # The parsing will incorrectly determine these as
910 # "yes", however, unicodedata.c will not perform quickchecks
911 # for older versions, and no delta records will be created.
912 quickchecks = [0] * 0x110000
913 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
914 for s in open_data(DERIVEDNORMALIZATION_PROPS, version):
915 if '#' in s:
916 s = s[:s.index('#')]
917 s = [i.strip() for i in s.split(';')]
918 if len(s) < 2 or s[1] not in qc_order:
919 continue
920 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
921 quickcheck_shift = qc_order.index(s[1])*2
922 quickcheck <<= quickcheck_shift
923 if '..' not in s[0]:
924 first = last = int(s[0], 16)
925 else:
926 first, last = [int(c, 16) for c in s[0].split('..')]
927 for char in range(first, last+1):
928 assert not (quickchecks[char]>>quickcheck_shift)&3
929 quickchecks[char] |= quickcheck
930 for i in range(0, 0x110000):
931 if table[i] is not None:
932 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000933
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000934 zip = zipfile.ZipFile(open_data(UNIHAN, version))
935 if version == '3.2.0':
936 data = zip.open('Unihan-3.2.0.txt').read()
937 else:
938 data = zip.open('Unihan_NumericValues.txt').read()
939 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000940 if not line.startswith('U+'):
941 continue
942 code, tag, value = line.split(None, 3)[:3]
943 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
944 'kOtherNumeric'):
945 continue
946 value = value.strip().replace(',', '')
947 i = int(code[2:], 16)
948 # Patch the numeric field
949 if table[i] is not None:
950 table[i][8] = value
951
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000952 def uselatin1(self):
953 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +0000954 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000955
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000956# hash table tools
957
958# this is a straight-forward reimplementation of Python's built-in
959# dictionary type, using a static data structure, and a custom string
960# hash algorithm.
961
def myhash(s, magic):
    """Case-insensitive 24-bit multiplicative string hash.

    Upper-cases *s*, then for each character multiplies the running hash
    by *magic* and adds the character's ordinal, folding any bits that
    overflow into the top byte back into the low 24 bits.
    """
    digest = 0
    for ch in s.upper():
        digest = digest * magic + ord(ch)
        overflow = digest & 0xff000000
        if overflow:
            # XOR the overflowing byte back in and keep only 24 bits
            digest = (digest ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return digest
970
# (table size, polynomial) candidates for the static hash table below;
# sizes are powers of two, and Hash.__init__ picks the first size larger
# than the data set, pairing it with its polynomial for collision probing.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
977
class Hash:
    """A static open-addressing hash table, dumpable as C data.

    Reimplements Python's dictionary probing with a fixed table so the C
    runtime can look names up without building a dict: slot index comes
    from myhash(), collisions are resolved by shifting the increment and
    folding it with the size's polynomial.
    """
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first power of two larger than the data set
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynominals")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        # n counts probe collisions, for statistics only
        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: derive a probe increment from the hash
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                # vary the increment, folding overflow with the polynomial
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots are stored as 0 in the C array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array plus the probing parameters
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1041
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001042# stuff to deal with arrays of unsigned integers
1043
class Array:
    """A named array of unsigned integers, dumpable as a C declaration."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array; the element type is the
        # smallest unsigned C integer that holds every value
        width = getsize(self.data)
        if trace:
            print(self.name+":", width*len(self.data), "bytes", file=sys.stderr)
        file.write("static ")
        if width == 1:
            ctype = "unsigned char"
        elif width == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            # wrap the initializer at 78 columns
            line = "    "
            for value in self.data:
                text = str(value) + ", "
                if len(line) + len(text) > 78:
                    file.write(line + "\n")
                    line = "    " + text
                else:
                    line = line + text
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1075
def getsize(data):
    """Return the smallest unsigned C integer size, in bytes (1, 2 or 4),
    able to hold every value in the given array of non-negative ints.

    An empty array fits in any width, so 1 is returned (the original
    version raised ValueError from max() on empty input).
    """
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
1085
Tim Peters21013482000-09-25 07:13:41 +00001086def splitbins(t, trace=0):
1087 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1088
1089 t is a sequence of ints. This function can be useful to save space if
1090 many of the ints are the same. t1 and t2 are lists of ints, and shift
1091 is an int, chosen to minimize the combined size of t1 and t2 (in C
1092 code), and where for each i in range(len(t)),
1093 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1094 where mask is a bitmask isolating the last "shift" bits.
1095
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001096 If optional arg trace is non-zero (default zero), progress info
1097 is printed to sys.stderr. The higher the value, the more info
1098 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001099 """
1100
Tim Peters21013482000-09-25 07:13:41 +00001101 if trace:
1102 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001103 print("%d+%d bins at shift %d; %d bytes" % (
1104 len(t1), len(t2), shift, bytes), file=sys.stderr)
1105 print("Size of original table:", len(t)*getsize(t), \
1106 "bytes", file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001107 n = len(t)-1 # last valid index
1108 maxshift = 0 # the most we can shift n and still have something left
1109 if n > 0:
1110 while n >> 1:
1111 n >>= 1
1112 maxshift += 1
1113 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001114 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001115 t = tuple(t) # so slices can be dict keys
1116 for shift in range(maxshift + 1):
1117 t1 = []
1118 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001119 size = 2**shift
1120 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001121 for i in range(0, len(t), size):
1122 bin = t[i:i+size]
1123 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001124 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001125 index = len(t2)
1126 bincache[bin] = index
1127 t2.extend(bin)
1128 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001129 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001130 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001131 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001132 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001133 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001134 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001135 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001136 t1, t2, shift = best
1137 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001138 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001139 dump(t1, t2, shift, bytes)
1140 if __debug__:
1141 # exhaustively verify that the decomposition is correct
1142 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001143 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001144 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1145 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001146
if __name__ == "__main__":
    # regenerate all tables, with tracing enabled
    maketables(1)