blob: 7266a91c4ec9c93771bcb6c1fd4b220a218a84e2 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
# this script converts the Unicode database files (UnicodeData.txt and
# its companion files, see UNIDATA_VERSION below) to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000022# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Georg Brandld52429f2008-07-04 15:55:02 +000023# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
Fredrik Lundhcfcea492000-09-25 08:07:06 +000024#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000025# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000026#
27
28import sys
29
# Script name and script version; both are written into the header
# comment of every generated file.
SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
UNIDATA_VERSION = "5.2.0"

# File name templates for the database inputs.  The "%s" slot receives
# "" for the current database and "-<version>" for an old version.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

# Older database versions that are merged into the current one.
old_versions = ["3.2.0"]

# The position of a name in the lists below is the numeric code stored
# in the generated tables, so the order (and the repeated "Cn") must
# not be changed.
CATEGORY_NAMES = [
    "Cn",
    "Lu", "Ll", "Lt",
    "Mn", "Mc", "Me",
    "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Cs", "Co", "Cn",
    "Lm", "Lo",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
    "Sm", "Sc", "Sk", "So",
]

BIDIRECTIONAL_NAMES = [
    "",
    "L", "LRE", "LRO",
    "R", "AL", "RLE", "RLO",
    "PDF",
    "EN", "ES", "ET", "AN", "CS",
    "NSM", "BN", "B", "S", "WS", "ON",
]

EASTASIANWIDTH_NAMES = ["F", "H", "W", "Na", "A", "N"]

MANDATORY_LINE_BREAKS = ["BK", "CR", "LF", "NL"]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 1 << 0          # 0x01
DECIMAL_MASK = 1 << 1        # 0x02
DIGIT_MASK = 1 << 2          # 0x04
LOWER_MASK = 1 << 3          # 0x08
LINEBREAK_MASK = 1 << 4      # 0x10
SPACE_MASK = 1 << 5          # 0x20
TITLE_MASK = 1 << 6          # 0x40
UPPER_MASK = 1 << 7          # 0x80
XID_START_MASK = 1 << 8      # 0x100
XID_CONTINUE_MASK = 1 << 9   # 0x200
PRINTABLE_MASK = 1 << 10     # 0x400
NODELTA_MASK = 1 << 11       # 0x800
NUMERIC_MASK = 1 << 12       # 0x1000
Fredrik Lundhe9133f72000-09-25 17:59:57 +000072
def maketables(trace=0):
    """Read the Unicode database and regenerate all three header files.

    The current database is loaded first; every version listed in
    ``old_versions`` is then loaded and merged into it through
    merge_old_version().  Finally the name, property and type tables
    are generated, in that order.

    *trace* is forwarded unchanged to the three generator functions.
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    # The current database files carry no version suffix.
    suffix = ""
    unicode = UnicodeData(
        UNICODE_DATA % suffix,
        COMPOSITION_EXCLUSIONS % suffix,
        EASTASIAN_WIDTH % suffix,
        UNIHAN % suffix,
        DERIVED_CORE_PROPERTIES % suffix,
        DERIVEDNORMALIZATION_PROPS % suffix,
        LINE_BREAK % suffix,
    )

    print(sum(1 for record in unicode.table if record), "characters")

    for old_version in old_versions:
        # Old database files are named "<base>-<version>.txt".
        suffix = "-" + old_version
        print("--- Reading", UNICODE_DATA % suffix, "...")
        old_unicode = UnicodeData(
            UNICODE_DATA % suffix,
            COMPOSITION_EXCLUSIONS % suffix,
            EASTASIAN_WIDTH % suffix,
            UNIHAN % suffix,
            DERIVED_CORE_PROPERTIES % suffix,
        )
        print(sum(1 for record in old_unicode.table if record), "characters")
        merge_old_version(old_version, unicode, old_unicode)

    for generate in (makeunicodename, makeunicodedata, makeunicodetype):
        generate(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000101
102# --------------------------------------------------------------------
103# unicode character properties
104
def _number_marked_chars(marked, chars):
    """Densely renumber the marked slots of *marked* and collect their runs.

    Every entry ``marked[i]`` (for ``i`` taken from *chars*, in order)
    that is not None is overwritten with a running index 0, 1, 2, ...

    Returns a ``(ranges, total)`` pair: *ranges* is a list of inclusive
    ``(start, end)`` tuples, one per run of marked code points in which
    each code point is exactly one above the previous one, and *total*
    is the number of marked entries.  *ranges* is empty when nothing is
    marked.
    """
    ranges = []
    total = 0
    run = None
    for i in chars:
        if marked[i] is None:
            continue
        marked[i] = total
        total += 1
        if run is None:
            run = (i, i)
        elif run[1] + 1 == i:
            # extends the current run by one code point
            run = (run[0], i)
        else:
            ranges.append(run)
            run = (i, i)
    if run is not None:
        ranges.append(run)
    return ranges, total


def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h from the loaded database.

    *unicode* is the database object built by maketables(); this
    function reads its ``chars``, ``table``, ``exclusions`` and
    ``changed`` attributes.  *trace* is forwarded to splitbins() and
    Array.dump().

    The generated header contains the unique property records, the NFC
    reindexing tables, the category / bidirectional / east asian width /
    decomposition prefix name tables, the split index tables for
    records, decompositions and NFC pairs, and one set of delta tables
    and lookup functions per entry of ``unicode.changed``.

    Raises Exception when a character's decomposition field has more
    than 19 whitespace-separated entries.
    """
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    # The cache maps record -> position in `table` (that is how it is
    # queried and filled below), so the dummy record is seeded as slot 0.
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        # extract database properties
        item = (
            CATEGORY_NAMES.index(record[2]),        # category
            int(record[3]),                         # combining
            BIDIRECTIONAL_NAMES.index(record[4]),   # bidirectional
            record[9] == "Y",                       # mirrored
            EASTASIANWIDTH_NAMES.index(record[15]),  # east asian width
            record[17],                             # normalization quickcheck
        )
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        if record[5]:
            decomp = record[5].split()
            if len(decomp) > 19:
                raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
            # prefix: an optional leading "<...>" tag, stored as its
            # position in decomp_prefix
            if decomp[0][0] == "<":
                prefix = decomp.pop(0)
            else:
                prefix = ""
            try:
                i = decomp_prefix.index(prefix)
            except ValueError:
                i = len(decomp_prefix)
                decomp_prefix.append(prefix)
            prefix = i
            assert prefix < 256
            # content: a header word (prefix index in the low byte,
            # length above it) followed by the code points
            decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
            # Collect NFC pairs
            if (not prefix and len(decomp) == 3
                    and char not in unicode.exclusions
                    and unicode.table[decomp[1]][3] == "0"):
                _, first, last = decomp
                comp_first[first] = 1
                comp_last[last] = 1
                comp_pairs.append((first, last, char))
            # decomp_data is a flat list of ints, so looking the list
            # `decomp` up in it can never succeed; the entry is always
            # appended.  NOTE(review): real de-duplication would change
            # the generated table and is deliberately not done here.
            i = len(decomp_data)
            decomp_data.extend(decomp)
            decomp_size = decomp_size + len(decomp) * 2
        else:
            i = 0
        decomp_index[char] = i

    # Replace the "is a first/last character" marks by dense indexes and
    # collect the runs of consecutive marked code points.
    comp_first_ranges, total_first = _number_marked_chars(comp_first,
                                                          unicode.chars)
    comp_last_ranges, total_last = _number_marked_chars(comp_last,
                                                        unicode.chars)

    # total_first x total_last matrix (row-major) of composed characters
    comp_data = [0]*(total_first*total_last)
    for first, last, char in comp_pairs:
        comp_data[comp_first[first]*total_last + comp_last[last]] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    # The context manager closes the file even if a later step raises.
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
        print("/* a list of unique database records */", file=fp)
        print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
        for item in table:
            print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
        print("};", file=fp)
        print(file=fp)

        print("/* Reindexing of NFC first characters. */", file=fp)
        print("#define TOTAL_FIRST",total_first, file=fp)
        print("#define TOTAL_LAST",total_last, file=fp)
        print("struct reindex{int start;short count,index;};", file=fp)
        reindex_tables = (
            ("static struct reindex nfc_first[] = {",
             comp_first_ranges, comp_first),
            ("static struct reindex nfc_last[] = {",
             comp_last_ranges, comp_last),
        )
        for declaration, ranges, numbering in reindex_tables:
            print(declaration, file=fp)
            for start, end in ranges:
                print(" { %d, %d, %d}," % (start,end-start,numbering[start]), file=fp)
            print(" {0,0,0}", file=fp)
            print("};\n", file=fp)

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        print("/* string literals */", file=fp)
        name_tables = (
            ("const char *_PyUnicode_CategoryNames[] = {",
             CATEGORY_NAMES),
            ("const char *_PyUnicode_BidirectionalNames[] = {",
             BIDIRECTIONAL_NAMES),
            ("const char *_PyUnicode_EastAsianWidthNames[] = {",
             EASTASIANWIDTH_NAMES),
            ("static const char *decomp_prefix[] = {",
             decomp_prefix),
        )
        for declaration, names in name_tables:
            print(declaration, file=fp)
            for name in names:
                print(" \"%s\"," % name, file=fp)
            print(" NULL", file=fp)
            print("};", file=fp)

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        print("/* index tables for the database records */", file=fp)
        print("#define SHIFT", shift, file=fp)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        print("/* decomposition data */", file=fp)
        Array("decomp_data", decomp_data).dump(fp, trace)

        print("/* index tables for the decomposition data */", file=fp)
        print("#define DECOMP_SHIFT", shift, file=fp)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        # split NFC pair matrix
        index1, index2, shift = splitbins(comp_data, trace)
        print("/* NFC pairs */", file=fp)
        print("#define COMP_SHIFT", shift, file=fp)
        Array("comp_index", index1).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, old_table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            # de-duplicate the change records; slot 0 is old_table[0]
            records = [old_table[0]]
            record_cache = {old_table[0]: 0}
            change_index = [0] * len(old_table)
            for i, record in enumerate(old_table):
                try:
                    change_index[i] = record_cache[record]
                except KeyError:
                    change_index[i] = record_cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(change_index, trace)
            print("static const change_record change_records_%s[] = {" % cversion, file=fp)
            for record in records:
                print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
            print("};", file=fp)
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
            print("{", file=fp)
            print("\tint index;", file=fp)
            print("\tif (n >= 0x110000) index = 0;", file=fp)
            print("\telse {", file=fp)
            print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
            print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" %
                  (cversion, shift, ((1<<shift)-1)), file=fp)
            print("\t}", file=fp)
            print("\treturn change_records_%s+index;" % cversion, file=fp)
            print("}\n", file=fp)
            print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
            print("{", file=fp)
            print("\tswitch(n) {", file=fp)
            for k, v in normalization:
                print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
            print("\tdefault: return 0;", file=fp)
            print("\t}\n}\n", file=fp)
353
354# --------------------------------------------------------------------
355# unicode character type tables
356
def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h from the loaded database.

    *unicode* is the database object built by maketables(); this
    function reads its ``chars`` and ``table`` attributes.  *trace* is
    forwarded to splitbins() and Array.dump().

    For every character with a record, a type descriptor
    ``(upper, lower, title, decimal, digit, flags)`` is computed.  The
    unique descriptors and a split index into them are written to the
    header, followed by generated C source for _PyUnicode_ToNumeric(),
    _PyUnicode_IsWhitespace() and _PyUnicode_IsLinebreak().
    """
    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    # The cache maps descriptor -> position in `table` (that is how it
    # is queried and filled below).  Seeding it that way lets characters
    # whose descriptor is all zeros reuse slot 0 instead of adding a
    # duplicate of the dummy record.
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)
    numeric = {}      # numeric value text -> list of code points
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        # extract database properties
        category = record[2]
        bidirectional = record[4]
        properties = record[16]
        flags = 0
        if category in ("Lm", "Lt", "Lu", "Ll", "Lo"):
            flags |= ALPHA_MASK
        if category == "Ll":
            flags |= LOWER_MASK
        if 'Line_Break' in properties or bidirectional == "B":
            flags |= LINEBREAK_MASK
            linebreaks.append(char)
        if category == "Zs" or bidirectional in ("WS", "B", "S"):
            flags |= SPACE_MASK
            spaces.append(char)
        if category == "Lt":
            flags |= TITLE_MASK
        if category == "Lu":
            flags |= UPPER_MASK
        if char == ord(" ") or category[0] not in ("C", "Z"):
            flags |= PRINTABLE_MASK
        if "XID_Start" in properties:
            flags |= XID_START_MASK
        if "XID_Continue" in properties:
            flags |= XID_CONTINUE_MASK
        # use delta predictor for upper/lower/title if it fits
        upper = int(record[12], 16) if record[12] else char
        lower = int(record[13], 16) if record[13] else char
        if record[14]:
            title = int(record[14], 16)
        else:
            # UCD.html says that a missing title char means that
            # it defaults to the uppercase character, not to the
            # character itself. Apparently, in the current UCD (5.x)
            # this feature is never used
            title = upper
        upper_d = upper - char
        lower_d = lower - char
        title_d = title - char
        if (-32768 <= upper_d <= 32767 and
                -32768 <= lower_d <= 32767 and
                -32768 <= title_d <= 32767):
            # use deltas, stored as 16-bit two's complement values
            upper = upper_d & 0xffff
            lower = lower_d & 0xffff
            title = title_d & 0xffff
        else:
            # a delta does not fit: keep the absolute code points
            flags |= NODELTA_MASK
        # decimal digit, integer digit
        decimal = 0
        if record[6]:
            flags |= DECIMAL_MASK
            decimal = int(record[6])
        digit = 0
        if record[7]:
            flags |= DIGIT_MASK
            digit = int(record[7])
        if record[8]:
            flags |= NUMERIC_MASK
            numeric.setdefault(record[8], []).append(char)
        item = (upper, lower, title, decimal, digit, flags)
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")

    print("--- Writing", FILE, "...")

    # The context manager closes the file even if a later step raises.
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print("/* a list of unique character type descriptors */", file=fp)
        print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
        for item in table:
            print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
        print("};", file=fp)
        print(file=fp)

        # split type record index table
        index1, index2, shift = splitbins(index, trace)

        print("/* type indexes */", file=fp)
        print("#define SHIFT", shift, file=fp)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        print('/* Returns the numeric value as double for Unicode characters', file=fp)
        print(' * having this property, -1.0 otherwise.', file=fp)
        print(' */', file=fp)
        print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
        print('{', file=fp)
        print(' switch (ch) {', file=fp)
        for value, codepoints in sorted(numeric.items()):
            # Turn text into float literals ("1/2" -> "1.0/2.0")
            value = '/'.join(repr(float(part)) for part in value.split('/'))

            # all code points sharing the value fall through to one return
            for codepoint in sorted(codepoints):
                print(' case 0x%04X:' % (codepoint,), file=fp)
            print(' return (double) %s;' % (value,), file=fp)
        print(' }', file=fp)
        print(' return -1.0;', file=fp)
        print('}', file=fp)
        print(file=fp)

        # Generate code for _PyUnicode_IsWhitespace()
        print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
        print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
        print(" */", file=fp)
        print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
        print('{', file=fp)
        print('#ifdef WANT_WCTYPE_FUNCTIONS', file=fp)
        print(' return iswspace(ch);', file=fp)
        print('#else', file=fp)
        print(' switch (ch) {', file=fp)

        for codepoint in sorted(spaces):
            print(' case 0x%04X:' % (codepoint,), file=fp)
        print(' return 1;', file=fp)

        print(' }', file=fp)
        print(' return 0;', file=fp)
        print('#endif', file=fp)
        print('}', file=fp)
        print(file=fp)

        # Generate code for _PyUnicode_IsLinebreak()
        print("/* Returns 1 for Unicode characters having the line break", file=fp)
        print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
        print(" * type 'B', 0 otherwise.", file=fp)
        print(" */", file=fp)
        print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
        print('{', file=fp)
        print(' switch (ch) {', file=fp)
        for codepoint in sorted(linebreaks):
            print(' case 0x%04X:' % (codepoint,), file=fp)
        print(' return 1;', file=fp)

        print(' }', file=fp)
        print(' return 0;', file=fp)
        print('}', file=fp)
        print(file=fp)
539
540# --------------------------------------------------------------------
541# unicode name database
542
543def makeunicodename(unicode, trace):
544
545 FILE = "Modules/unicodename_db.h"
546
Collin Winter6afaeb72007-08-03 17:06:41 +0000547 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000548
549 # collect names
550 names = [None] * len(unicode.chars)
551
552 for char in unicode.chars:
553 record = unicode.table[char]
554 if record:
555 name = record[1].strip()
556 if name and name[0] != "<":
557 names[char] = name + chr(0)
558
Georg Brandl559e5d72008-06-11 18:37:52 +0000559 print(len(list(n for n in names if n is not None)), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000560
561 # collect unique words from names (note that we differ between
562 # words inside a sentence, and words ending a sentence. the
563 # latter includes the trailing null byte.
564
565 words = {}
566 n = b = 0
567 for char in unicode.chars:
568 name = names[char]
569 if name:
570 w = name.split()
571 b = b + len(name)
572 n = n + len(w)
573 for w in w:
574 l = words.get(w)
575 if l:
576 l.append(None)
577 else:
578 words[w] = [len(words)]
579
Collin Winter6afaeb72007-08-03 17:06:41 +0000580 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000581
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000582 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000583
Martin v. Löwis97225da2002-11-24 23:05:09 +0000584 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000585 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000586 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000587 return -len(alist), aword
588 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000589
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000590 # figure out how many phrasebook escapes we need
591 escapes = 0
592 while escapes * 256 < len(wordlist):
593 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000594 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000595
596 short = 256 - escapes
597
598 assert short > 0
599
Collin Winter6afaeb72007-08-03 17:06:41 +0000600 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000601
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000602 # statistics
603 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000604 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000605 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000606 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000607
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000608 # pick the most commonly used words, and sort the rest on falling
609 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000610
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000611 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000612 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000613 wordlist.extend(wordtail)
614
615 # generate lexicon from words
616
617 lexicon_offset = [0]
618 lexicon = ""
619 words = {}
620
621 # build a lexicon string
622 offset = 0
623 for w, x in wordlist:
624 # encoding: bit 7 indicates last character in word (chr(128)
625 # indicates the last character in an entire string)
626 ww = w[:-1] + chr(ord(w[-1])+128)
627 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000628 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000629 if o < 0:
630 o = offset
631 lexicon = lexicon + ww
632 offset = offset + len(w)
633 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000634 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000635
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000636 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000637
638 # generate phrasebook from names and lexicon
639 phrasebook = [0]
640 phrasebook_offset = [0] * len(unicode.chars)
641 for char in unicode.chars:
642 name = names[char]
643 if name:
644 w = name.split()
645 phrasebook_offset[char] = len(phrasebook)
646 for w in w:
647 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000648 if i < short:
649 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000650 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000651 # store as two bytes
652 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000653 phrasebook.append(i&255)
654
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000655 assert getsize(phrasebook) == 1
656
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000657 #
658 # unicode name hash table
659
660 # extract names
661 data = []
662 for char in unicode.chars:
663 record = unicode.table[char]
664 if record:
665 name = record[1].strip()
666 if name and name[0] != "<":
667 data.append((name, char))
668
669 # the magic number 47 was chosen to minimize the number of
670 # collisions on the current data set. if you like, change it
671 # and see what happens...
672
673 codehash = Hash("code", data, 47)
674
Collin Winter6afaeb72007-08-03 17:06:41 +0000675 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000676
677 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000678 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
679 print(file=fp)
680 print("#define NAME_MAXLEN", 256, file=fp)
681 print(file=fp)
682 print("/* lexicon */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000683 Array("lexicon", lexicon).dump(fp, trace)
684 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000685
686 # split decomposition index table
687 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
688
Collin Winter6afaeb72007-08-03 17:06:41 +0000689 print("/* code->name phrasebook */", file=fp)
690 print("#define phrasebook_shift", shift, file=fp)
691 print("#define phrasebook_short", short, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000692
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000693 Array("phrasebook", phrasebook).dump(fp, trace)
694 Array("phrasebook_offset1", offset1).dump(fp, trace)
695 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000696
Collin Winter6afaeb72007-08-03 17:06:41 +0000697 print("/* name->code dictionary */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000698 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000699
700 fp.close()
701
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000702
def merge_old_version(version, new, old):
    """Record how the older database *old* differs from *new*.

    One ``(version, changes, normalization_changes)`` entry is appended to
    ``new.changed``.  ``changes`` is a list with one tuple per code point,
    ``(bidir, category, decimal, mirrored, numeric)``, holding the *old*
    value of each property or the "no change" marker described below.
    ``normalization_changes`` is a list of ``(code point, old decomposition)``
    pairs.

    Raises NotImplementedError when the exclusion tables differ, and a
    local ``Difference`` exception when a field changed that this function
    neither records nor explicitly ignores.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    class Difference(Exception):pass

    npoints = 0x110000
    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF] * npoints
    category_changes = [0xFF] * npoints
    decimal_changes = [0xFF] * npoints
    mirrored_changes = [0xFF] * npoints
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * npoints
    # normalization_changes is a list of key-value pairs
    normalization_changes = []

    # Record fields whose changes are deliberately not tracked:
    # 11 = ISO comment, 12/13/14 = simple upper/lower/titlecase mapping,
    # 16 = derived properties (not yet).
    ignored_fields = (11, 12, 13, 14, 16)

    for i in range(npoints):
        new_record = new.table[i]
        if new_record is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        old_record = old.table[i]
        if old_record is None:
            # unassigned in the old version; category 0 is "unassigned"
            category_changes[i] = 0
            continue
        if old_record == new_record:
            continue
        # the records differ: look at each field in turn
        for k, value in enumerate(old_record):
            if value == new_record[k]:
                continue
            if k == 2:
                category_changes[i] = CATEGORY_NAMES.index(value)
            elif k == 4:
                bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
            elif k == 5:
                # We assume that all normalization changes are in 1:1 mappings
                assert " " not in value
                normalization_changes.append((i, value))
            elif k == 6:
                # we only support changes where the old value is a single digit
                assert value in "0123456789"
                decimal_changes[i] = int(value)
            elif k == 8:
                # Since 0 encodes "no change", the old value is better not 0
                if value:
                    numeric_changes[i] = float(value)
                    assert numeric_changes[i] not in (0, -1)
                else:
                    numeric_changes[i] = -1
            elif k == 9:
                mirrored_changes[i] = '1' if value == 'Y' else '0'
            elif k in ignored_fields:
                pass
            else:
                raise Difference(hex(i), k, old_record, new_record)

    changes = list(zip(bidir_changes, category_changes, decimal_changes,
                       mirrored_changes, numeric_changes))
    new.changed.append((version, changes, normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000785
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000786
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000787# --------------------------------------------------------------------
788# the following support code is taken from the unidb utilities
789# Copyright (c) 1999-2000 by Secret Labs AB
790
791# load a unicode-data file from disk
792
class UnicodeData:
    """Unicode character database loaded from a set of data files.

    Public attributes:
      filename    path of the main data file that was read
      table       list with 0x110000 entries indexed by code point; each
                  entry is None (no record) or a list of fields laid out
                  as described below
      chars       list of the code points to process
      exclusions  dict mapping every code point named in the exclusions
                  file to 1
      changed     list, empty after construction; merge_old_version()
                  appends to it
    """
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,                  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name,       (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width,       (16)
    #  derived-props]                                                (17)
    # When derivednormalizationprops is given, one more field holding the
    # packed normalization quick-check bits is appended after derived-props.

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivedprops, derivednormalizationprops=None, linebreakprops=None,
                 expand=1):
        """Read all data files and build the per-code-point table.

        filename, exclusions, eastasianwidth, unihan and derivedprops are
        paths of files that are always read.  derivednormalizationprops
        and linebreakprops are optional paths; a false value skips them.
        If expand is true, "First>"/"Last>" range markers in the main file
        are expanded into one record per code point.
        """
        self.changed = []
        table = [None] * 0x110000
        with open(filename) as datafile:
            for line in datafile:
                line = line.strip()
                if not line:
                    # a blank line carries no record; skip it instead of
                    # failing in int('', 16)
                    continue
                record = line.split(";")
                table[int(record[0], 16)] = record

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    # inside a range: clone the "First>" record
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = list(range(0x110000))

        self.exclusions = {}
        with open(exclusions) as datafile:
            for line in datafile:
                line = line.strip()
                if not line or line[0] == '#':
                    continue
                char = int(line.split()[0], 16)
                self.exclusions[char] = 1

        # east asian width becomes field 15 of every record
        widths = [None] * 0x110000
        with open(eastasianwidth) as datafile:
            for line in datafile:
                line = line.strip()
                if not line or line[0] == '#':
                    continue
                fields = line.split()[0].split(';')
                for char in self._expand_range(fields[0]):
                    widths[char] = fields[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # the set of derived property names becomes field 16
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        with open(derivedprops) as datafile:
            for line in datafile:
                line = line.split('#', 1)[0].strip()
                if not line:
                    continue

                r, p = line.split(";")
                p = p.strip()
                for char in self._expand_range(r.strip()):
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        # Must run before the quick-check field is appended below, because
        # it relies on the property set still being the last field.
        if linebreakprops:
            with open(linebreakprops) as datafile:
                for line in datafile:
                    fields = [i.strip()
                              for i in line.partition('#')[0].split(';')]
                    if len(fields) < 2 or fields[1] not in MANDATORY_LINE_BREAKS:
                        continue
                    for char in self._expand_range(fields[0]):
                        # assumes every listed code point has a record
                        table[char][-1].add('Line_Break')

        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            with open(derivednormalizationprops) as datafile:
                for line in datafile:
                    fields = [i.strip()
                              for i in line.partition('#')[0].split(';')]
                    if len(fields) < 2 or fields[1] not in qc_order:
                        continue
                    quickcheck = 'MN'.index(fields[2]) + 1 # Maybe or No
                    # two bits per normalization form, in qc_order order
                    quickcheck_shift = qc_order.index(fields[1])*2
                    quickcheck <<= quickcheck_shift
                    for char in self._expand_range(fields[0]):
                        assert not (quickchecks[char]>>quickcheck_shift)&3
                        quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        with open(unihan, encoding='utf-8') as datafile:
            for line in datafile:
                if not line.startswith('U+'):
                    continue
                code, tag, value = line.split(None, 3)[:3]
                if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                               'kOtherNumeric'):
                    continue
                value = value.strip().replace(',', '')
                i = int(code[2:], 16)
                # Patch the numeric field
                if table[i] is not None:
                    table[i][8] = value

    @staticmethod
    def _expand_range(spec):
        # Return the code points named by a "XXXX" or "XXXX..YYYY" hex spec.
        first, sep, last = spec.partition('..')
        first = int(first, 16)
        last = int(last, 16) if sep else first
        return range(first, last + 1)

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000940
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000941# hash table tools
942
943# this is a straight-forward reimplementation of Python's built-in
944# dictionary type, using a static data structure, and a custom string
945# hash algorithm.
946
def myhash(s, magic):
    """Return the hash of string *s* for multiplier *magic*.

    The string is upper-cased first, so the hash ignores case.  After each
    character, any bits set in positions 24-31 are folded into the low byte
    and the value is cut back to 24 bits.
    """
    value = 0
    for ch in s.upper():
        value = value * magic + ord(ch)
        high = value & 0xff000000
        if high:
            value = (value ^ ((high >> 24) & 0xff)) & 0x00ffffff
    return value
955
# (table size, polynomial) candidates for Hash, in increasing size order.
# Hash uses the first entry whose size exceeds the number of items and
# adds that size to the polynomial.
SIZES = [
    (4, 3),
    (8, 3),
    (16, 3),
    (32, 5),
    (64, 3),
    (128, 3),
    (256, 29),
    (512, 17),
    (1024, 9),
    (2048, 5),
    (4096, 83),
    (8192, 27),
    (16384, 43),
    (32768, 3),
    (65536, 45),
    (131072, 9),
    (262144, 39),
    (524288, 39),
    (1048576, 9),
    (2097152, 5),
    (4194304, 3),
    (8388608, 33),
    (16777216, 27),
]
962
class Hash:
    """Static hash table built from a list of (key, value) pairs.

    Attributes set by the constructor:
      data        Array named ``<name>_hash`` holding the slot values
                  (unused slots hold 0)
      magic       multiplier that was passed to myhash()
      name        prefix used for the emitted C identifiers
      size        number of slots
      poly        table size plus the polynomial taken from SIZES
      collisions  number of extra probes needed while filling the table
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # pick the first table size larger than the number of entries
        nitems = len(data)
        for size, poly in SIZES:
            if size > nitems:
                poly += size
                break
        else:
            raise AssertionError("ran out of polynominals")

        print(size, "slots in hash table")

        slots = [None] * size
        mask = size - 1
        collisions = 0

        # insert every entry, probing for a free slot on collision
        for key, value in data:
            h = myhash(key, magic)
            pos = ~h & mask
            if slots[pos] is None:
                slots[pos] = value
                continue
            # the probe step must never be zero
            step = ((h ^ (h >> 3)) & mask) or mask
            while True:
                collisions += 1
                pos = (pos + step) & mask
                if slots[pos] is None:
                    slots[pos] = value
                    break
                step <<= 1
                if step > mask:
                    step ^= poly

        print(collisions, "collisions")
        self.collisions = collisions

        # unused slots are written out as 0
        slots = [0 if entry is None else entry for entry in slots]

        self.data = Array(name + "_hash", slots)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array followed by the table parameters
        self.data.dump(file, trace)
        for label, number in (("magic", self.magic),
                              ("size", self.size),
                              ("poly", self.poly)):
            file.write("#define %s_%s %d\n" % (self.name, label, number))
1026
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001027# stuff to deal with arrays of unsigned integers
1028
class Array:
    """A named sequence of unsigned integers that can be dumped as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the data to *file* as a static C array definition.

        The element type is chosen from getsize() of the data.  If *trace*
        is true, the array's size in bytes is reported on sys.stderr.
        """
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        ctype = {1: "unsigned char", 2: "unsigned short"}.get(size, "unsigned int")
        file.write("static " + ctype + " " + self.name + "[] = {\n")
        if self.data:
            # NOTE(review): the line prefix is reproduced as found in this
            # copy of the file -- confirm the intended indent width
            prefix = " "
            line = prefix
            for item in self.data:
                text = str(item) + ", "
                if len(line) + len(text) > 78:
                    # current line is full; flush it and start a new one
                    file.write(line + "\n")
                    line = prefix + text
                else:
                    line += text
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1060
def getsize(data):
    """Return the smallest integer size, in bytes, for the given array.

    The result is 1 if every item is below 256, 2 if every item is below
    65536, and 4 otherwise.  An empty array yields 1.
    """
    # An empty array has no largest item; treat it as fitting in one byte
    # instead of letting max() raise ValueError.  Array.dump() has a guard
    # for empty data that could not be reached otherwise.
    maxdata = max(data) if data else 0
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
1070
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  Splitting pays off when many of the ints
    repeat.  t1 and t2 are lists of ints and shift is an int, picked so
    that the combined size of t1 and t2 (in C code) is minimal, and such
    that for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def report(t1, t2, shift, nbytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, nbytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t),
              "bytes", file=sys.stderr)

    # the most we can shift the last valid index and still have
    # something left
    last = len(t) - 1
    maxshift = last.bit_length() - 1 if last > 0 else 0

    best_size = sys.maxsize  # smallest total size so far
    t = tuple(t)             # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        width = 1 << shift
        seen = {}
        for start in range(0, len(t), width):
            chunk = t[start:start+width]
            index = seen.get(chunk)
            if index is None:
                # first occurrence of this bin: store it in t2
                index = seen[chunk] = len(t2)
                t2.extend(chunk)
            t1.append(index >> shift)
        # determine memory size
        total = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            report(t1, t2, shift, total)
        if total < best_size:
            best = t1, t2, shift
            best_size = total
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        report(t1, t2, shift, best_size)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = (1 << shift) - 1  # i.e., low-bit mask of shift bits
        for i, expected in enumerate(t):
            assert expected == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001131
# When executed as a script, run maketables().
# NOTE(review): the argument 1 is presumably a trace level -- confirm
# against the definition of maketables().
if __name__ == "__main__":
    maketables(1)