#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.2.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NODELTA_MASK = 0x800
NUMERIC_MASK = 0x1000
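# Added note (not from the original source): these masks are OR'ed together
# into the flags field of each type record below, so one small integer can
# carry several boolean properties.  For example, an uppercase ASCII letter
# would typically end up with something like
#   ALPHA_MASK | UPPER_MASK | PRINTABLE_MASK | XID_START_MASK | XID_CONTINUE_MASK
# and a property test is a bitwise AND against the relevant mask.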

def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          UNIHAN % version,
                          DERIVED_CORE_PROPERTIES % version,
                          DERIVEDNORMALIZATION_PROPS % version,
                          LINE_BREAK % version)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version),
                                  UNIHAN % ("-"+version),
                                  DERIVED_CORE_PROPERTIES % ("-"+version))
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
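                # Added illustration (not in the original): the first list
                # element packs the prefix index into the low byte and the
                # decomposition length into the high byte, e.g. a two-code-point
                # canonical decomposition (prefix index 0) starts with
                # (2<<8) | 0 == 512.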
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

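    # Added note: comp_data is a flattened total_first x total_last matrix;
    # roughly speaking, the composed character for a (first, last) pair is
    # found at comp_data[comp_first[first]*total_last + comp_last[last]],
    # with 0 meaning "no canonical composition".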
    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
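            # Added note: when all three deltas fit, upper/lower/title are
            # stored as 16-bit two's-complement offsets from the character
            # itself (e.g. 'a' -> 'A' is -32, stored as 0xffe0); NODELTA_MASK
            # tells the consumer to treat the stored values as absolute
            # code points instead.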
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UNICODE ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        haswide = False
        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
            if codepoint < 0x10000:
                hasnonewide = True
            if codepoint >= 0x10000 and not haswide:
                print('#ifdef Py_UNICODE_WIDE', file=fp)
                haswide = True
            print('    case 0x%04X:' % (codepoint,), file=fp)
        if haswide and hasnonewide:
            print('#endif', file=fp)
        print('        return (double) %s;' % (value,), file=fp)
        if haswide and not hasnonewide:
            print('#endif', file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)', file=fp)
    print('{', file=fp)
    print('#ifdef WANT_WCTYPE_FUNCTIONS', file=fp)
    print('    return iswspace(ch);', file=fp)
    print('#else', file=fp)
    print('    switch (ch) {', file=fp)

    haswide = False
    hasnonewide = False
    for codepoint in sorted(spaces):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print('#ifdef Py_UNICODE_WIDE', file=fp)
            haswide = True
        print('    case 0x%04X:' % (codepoint,), file=fp)
    if haswide and hasnonewide:
        print('#endif', file=fp)
    print('        return 1;', file=fp)
    if haswide and not hasnonewide:
        print('#endif', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('#endif', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    haswide = False
    hasnonewide = False
    for codepoint in sorted(linebreaks):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print('#ifdef Py_UNICODE_WIDE', file=fp)
            haswide = True
        print('    case 0x%04X:' % (codepoint,), file=fp)
    if haswide and hasnonewide:
        print('#endif', file=fp)
    print('        return 1;', file=fp)
    if haswide and not hasnonewide:
        print('#endif', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence, and words ending a sentence. the latter
    # includes the trailing null byte.)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
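        # Added example: a word stored mid-name, say "CAPITAL", becomes
        # "CAPITA" plus chr(ord("L")+128); the high bit on the final byte
        # replaces an explicit length or terminator in the lexicon.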
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

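    # Added note: word indexes >= short are stored above as an escape byte
    # (i>>8)+short followed by the low byte, while indexes < short take a
    # single byte, so every phrasebook entry fits in an unsigned char --
    # which is what the assert below checks.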
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                        normalization_changes))


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivedprops, derivednormalizationprops=None, linebreakprops=None,
                 expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        for s in open(derivedprops):
            s = s.split('#', 1)[0].strip()
            if not s:
                continue

            r, p = s.split(";")
            r = r.strip()
            p = p.strip()
            if ".." in r:
                first, last = [int(c, 16) for c in r.split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(r, 16)]
            for char in chars:
                if table[char]:
                    # Some properties (e.g. Default_Ignorable_Code_Point)
                    # apply to unassigned code points; ignore them
                    table[char][-1].add(p)

        if linebreakprops:
            for s in open(linebreakprops):
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            for s in open(derivednormalizationprops):
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
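                # Added note: each normalization form gets two bits of the
                # per-character quickcheck value, in qc_order order, so the
                # encoded value for a form is 0 (Yes, the default), 1 (Maybe)
                # or 2 (No).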
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        for line in open(unihan, encoding='utf-8'):
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

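# Added illustration: a lookup hashes the upper-cased name with myhash(),
# starts probing at (~h) & mask, and on a collision steps by an increment
# derived from the hash; whenever the increment overflows the mask it is
# folded back with the per-size polynomial from SIZES, which is intended to
# keep the probe sequence covering the whole table.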
def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
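    # Added worked example: with shift=1, t=[5,5,5,5,9,9,5,5] can be split
    # into t2=[5,5,9,9] and t1=[0,0,1,0], because the four 2-element chunks
    # are (5,5), (5,5), (9,9), (5,5) and only two distinct chunks are stored;
    # then t[5] == t2[(t1[5>>1]<<1) + (5&1)] == t2[3] == 9.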

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), \
            "bytes", file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)