#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys

SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
UNIDATA_VERSION = "5.2.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NODELTA_MASK = 0x800
NUMERIC_MASK = 0x1000
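#
# makeunicodetype() below ORs these masks into a single flags field per
# character; for example, an uppercase letter such as "A" would carry
# ALPHA_MASK | UPPER_MASK | PRINTABLE_MASK (plus XID_START_MASK and
# XID_CONTINUE_MASK when DerivedCoreProperties lists it as XID_Start
# and XID_Continue).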

def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          UNIHAN % version,
                          DERIVED_CORE_PROPERTIES % version,
                          DERIVEDNORMALIZATION_PROPS % version,
                          LINE_BREAK % version)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version),
                                  UNIHAN % ("-"+version),
                                  DERIVED_CORE_PROPERTIES % ("-"+version))
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself.  Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split type record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte.)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

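    # Phrasebook entries are indexes into the lexicon: an index below
    # 'short' is stored as a single byte, while a larger index becomes
    # an escape byte ((i>>8) + short) followed by the low byte (i & 255).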
    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value had better not be 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                        normalization_changes))


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivedprops, derivednormalizationprops=None, linebreakprops=None,
                 expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        for s in open(derivedprops):
            s = s.split('#', 1)[0].strip()
            if not s:
                continue

            r, p = s.split(";")
            r = r.strip()
            p = p.strip()
            if ".." in r:
                first, last = [int(c, 16) for c in r.split('..')]
                chars = list(range(first, last+1))
            else:
                chars = [int(r, 16)]
            for char in chars:
                if table[char]:
                    # Some properties (e.g. Default_Ignorable_Code_Point)
                    # apply to unassigned code points; ignore them
                    table[char][-1].add(p)

        if linebreakprops:
            for s in open(linebreakprops):
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            for s in open(derivednormalizationprops):
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        for line in open(unihan, encoding='utf-8'):
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

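# myhash() keeps the running hash within 24 bits by folding the high
# byte back in; Hash resolves collisions with open addressing, doubling
# the probe increment and XOR-ing it with a size-specific polynomial
# whenever it overflows the table mask.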
943def myhash(s, magic):
944 h = 0
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000945 for c in map(ord, s.upper()):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000946 h = (h * magic) + c
Guido van Rossumcd16bf62007-06-13 18:07:49 +0000947 ix = h & 0xff000000
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000948 if ix:
949 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
950 return h
951
952SIZES = [
953 (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
954 (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
955 (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
956 (2097152,5), (4194304,3), (8388608,33), (16777216,27)
957]
958
959class Hash:
960 def __init__(self, name, data, magic):
961 # turn a (key, value) list into a static hash table structure
962
963 # determine table size
964 for size, poly in SIZES:
965 if size > len(data):
966 poly = size + poly
967 break
968 else:
Collin Wintera817e582007-08-22 23:05:06 +0000969 raise AssertionError("ran out of polynominals")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000970
Collin Winter6afaeb72007-08-03 17:06:41 +0000971 print(size, "slots in hash table")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000972
973 table = [None] * size
974
975 mask = size-1
976
977 n = 0
978
979 hash = myhash
980
981 # initialize hash table
982 for key, value in data:
983 h = hash(key, magic)
984 i = (~h) & mask
985 v = table[i]
986 if v is None:
987 table[i] = value
988 continue
989 incr = (h ^ (h >> 3)) & mask;
990 if not incr:
991 incr = mask
992 while 1:
993 n = n + 1
994 i = (i + incr) & mask
995 v = table[i]
996 if v is None:
997 table[i] = value
998 break
999 incr = incr << 1
1000 if incr > mask:
1001 incr = incr ^ poly
1002
Collin Winter6afaeb72007-08-03 17:06:41 +00001003 print(n, "collisions")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001004 self.collisions = n
1005
1006 for i in range(len(table)):
1007 if table[i] is None:
1008 table[i] = 0
1009
1010 self.data = Array(name + "_hash", table)
1011 self.magic = magic
1012 self.name = name
1013 self.size = size
1014 self.poly = poly
1015
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001016 def dump(self, file, trace):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001017 # write data to file, as a C array
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001018 self.data.dump(file, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001019 file.write("#define %s_magic %d\n" % (self.name, self.magic))
1020 file.write("#define %s_size %d\n" % (self.name, self.size))
1021 file.write("#define %s_poly %d\n" % (self.name, self.poly))
1022
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001023# stuff to deal with arrays of unsigned integers
1024
1025class Array:
1026
1027 def __init__(self, name, data):
1028 self.name = name
1029 self.data = data
1030
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001031 def dump(self, file, trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001032 # write data to file, as a C array
1033 size = getsize(self.data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001034 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001035 print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001036 file.write("static ")
1037 if size == 1:
1038 file.write("unsigned char")
1039 elif size == 2:
1040 file.write("unsigned short")
1041 else:
1042 file.write("unsigned int")
1043 file.write(" " + self.name + "[] = {\n")
1044 if self.data:
1045 s = " "
1046 for item in self.data:
1047 i = str(item) + ", "
1048 if len(s) + len(i) > 78:
1049 file.write(s + "\n")
1050 s = " " + i
1051 else:
1052 s = s + i
Walter Dörwaldaaab30e2002-09-11 20:36:02 +00001053 if s.strip():
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001054 file.write(s + "\n")
1055 file.write("};\n\n")
1056
1057def getsize(data):
1058 # return smallest possible integer size for the given array
1059 maxdata = max(data)
1060 if maxdata < 256:
1061 return 1
1062 elif maxdata < 65536:
1063 return 2
1064 else:
1065 return 4
1066
Tim Peters21013482000-09-25 07:13:41 +00001067def splitbins(t, trace=0):
1068 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1069
1070 t is a sequence of ints. This function can be useful to save space if
1071 many of the ints are the same. t1 and t2 are lists of ints, and shift
1072 is an int, chosen to minimize the combined size of t1 and t2 (in C
1073 code), and where for each i in range(len(t)),
1074 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1075 where mask is a bitmask isolating the last "shift" bits.
1076
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001077 If optional arg trace is non-zero (default zero), progress info
1078 is printed to sys.stderr. The higher the value, the more info
1079 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001080 """
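    # For example, with shift == 5 the lookup above reads
    # t2[(t1[i >> 5] << 5) + (i & 0x1f)]: t1 maps each 32-entry block
    # of the original table to a (possibly shared) block stored in t2.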

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), \
            "bytes", file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)