#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
29
import os
import sys
import zipfile

from textwrap import dedent
from operator import itemgetter

# Path of this script, used in the "generated by" banner of the output files.
SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
# Version of the Unicode Character Database the tables are generated from.
UNIDATA_VERSION = "6.1.0"
# UCD input file name templates: "%s" is filled with "" for the current
# version, or "-<version>" for an older version (see maketables()).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# older UCD versions for which delta tables are also generated
old_versions = ["3.2.0"]

# general category codes; a category's index in this list is the numeric
# value stored in the generated database records
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# bidirectional class codes, encoded by list index like the categories
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# east asian width codes, encoded by list index
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# character-type flag bits stored in the generated type records
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FCC'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D')
]
103
def maketables(trace=0):
    """Drive the generation: load the current UCD, merge the deltas for
    every supported older version, then emit all three C header files."""

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNIDATA_VERSION)

    # count the code points that actually have a database record
    print(sum(1 for entry in unicode.table if entry), "characters")

    # merge in the change information for each older supported version
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(sum(1 for entry in old_unicode.table if entry), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000122
123# --------------------------------------------------------------------
124# unicode character properties
125
def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h.

    For the UnicodeData instance *unicode*, this emits:
      1) the deduplicated database records (category, combining class,
         bidirectional class, mirrored flag, east asian width,
         normalization quickcheck),
      2) the compressed decomposition data plus its index tables,
      3) the NFC composition pair tables, and
      4) delta tables and accessors for each older UCD version listed
         in unicode.changed.

    *trace* is passed through to splitbins()/Array.dump().
    """

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix (compatibility tag such as "<compat>", if any)
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs prefix index (low byte) and
                # decomposition length (high byte), followed by code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs (canonical two-character decompositions
                # that are not excluded and whose first char is non-combining)
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    # NOTE(review): decomp_data is a flat list of ints while
                    # decomp is a list, so this index() call can never match
                    # and always raises ValueError -- entries are appended
                    # unconditionally; confirm whether deduplication of whole
                    # decompositions was intended here.
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # build ranges of consecutive code points that occur as first/last
    # character of an NFC pair, assigning each a dense reindexed value
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # dense (first, last) -> composed character matrix
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()
374
# --------------------------------------------------------------------
# unicode character type tables
377
def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h.

    Builds the deduplicated character type records (simple case mappings,
    decimal/digit values, and the *_MASK flag bits), the extended case
    array for one-to-many case mappings, plus the generated C helpers
    _PyUnicode_ToNumeric(), _PyUnicode_IsWhitespace() and
    _PyUnicode_IsLinebreak().

    *unicode* is a UnicodeData instance; *trace* is passed to splitbins().
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}        # numeric value (as text) -> list of code points
    spaces = []         # code points flagged as whitespace
    linebreaks = []     # code points flagged as line breaks
    extra_casing = []   # flattened storage for one-to-many case mappings

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            # simple case mappings from UnicodeData.txt fields 12-14;
            # an empty field means the character maps to itself
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # a case folding different from the simple lowercase forces
            # the extended representation below
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    # store deltas relative to the character itself
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    # deltas must fit in a signed 32-bit field
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split the type record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
587
# --------------------------------------------------------------------
# unicode name database
590
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h: compressed code<->name tables.

    From *unicode* (a UnicodeData instance) this builds:
      - a lexicon of all words used in character names, with shared tails;
      - a per-character "phrasebook" of word indexes (code -> name);
      - a static hash table mapping names back to code points.
    *trace* is passed through to the Array/Hash dump routines.
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names; a trailing NUL is appended to every name so the
    # final word of a name is distinct from the same word mid-name
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":  # skip <control>, <...First/Last> labels
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we differentiate between
    # words inside a name and words ending a name; the latter include the
    # trailing null byte)

    words = {}   # word -> list whose length is that word's frequency
    n = b = 0    # n: total word count; b: total name bytes
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need: word indexes below
    # "short" fit in one byte, the rest are escaped to two bytes
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)
    # NOTE(review): key=lambda a: a[0] actually sorts the tail
    # reverse-lexicographically by word text, not by falling length as
    # the comment above says -- harmless for correctness (only affects
    # compression), but confirm intent before changing either.

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}   # rebound: word -> index into lexicon_offset (first word gets 1)

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1  # every entry must fit an unsigned char

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    print(file=fp)
    print('static const unsigned int aliases_start = %#x;' %
          NAME_ALIASES_START, file=fp)
    print('static const unsigned int aliases_end = %#x;' %
          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

    print('static const unsigned int name_aliases[] = {', file=fp)
    for name, codepoint in unicode.aliases:
        print('    0x%04X,' % codepoint, file=fp)
    print('};', file=fp)

    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
    # sequences or sequences with non-BMP chars are added.
    # unicodedata_lookup should be adapted too.
    print(dedent("""
        typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
        } named_sequence;
        """), file=fp)

    print('static const unsigned int named_sequences_start = %#x;' %
          NAMED_SEQUENCES_START, file=fp)
    print('static const unsigned int named_sequences_end = %#x;' %
          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

    print('static const named_sequence named_sequences[] = {', file=fp)
    for name, sequence in unicode.named_sequences:
        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
    print('};', file=fp)

    fp.close()
782
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000783
def merge_old_version(version, new, old):
    """Record the differences between UCD *old* and *new* on new.changed.

    Per-codepoint delta arrays are built for category, bidi class,
    decimal digit, mirrored flag and numeric value (0xFF -- or 0 for the
    numeric field -- encodes "no change"), plus a list of 1:1
    normalization changes, so unicodedata.c can answer queries against
    the older *version*.  Fields not modeled as deltas (case mappings,
    ISO comment, derived properties, quickchecks) are deliberately
    ignored; any other difference raises.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ, field by field (k indexes the
        # record layout documented on UnicodeData)
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # mirrored flag, stored as character '1'/'0'
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000874
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000875def open_data(template, version):
876 local = template % ('-'+version,)
877 if not os.path.exists(local):
878 import urllib.request
879 if version == '3.2.0':
880 # irregular url structure
881 url = 'http://www.unicode.org/Public/3.2-Update/' + local
882 else:
883 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
884 urllib.request.urlretrieve(url, filename=local)
885 if local.endswith('.txt'):
886 return open(local, encoding='utf-8')
887 else:
888 # Unihan.zip
889 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000890
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000891# --------------------------------------------------------------------
892# the following support code is taken from the unidb utilities
893# Copyright (c) 1999-2000 by Secret Labs AB
894
895# load a unicode-data file from disk
896
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000897class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000898 # Record structure:
899 # [ID, name, category, combining, bidi, decomp, (6)
900 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
901 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
902 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000903
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000904 def __init__(self, version,
905 linebreakprops=False,
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000906 expand=1,
907 cjk_check=True):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000908 self.changed = []
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000909 table = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300910 with open_data(UNICODE_DATA, version) as file:
911 while 1:
912 s = file.readline()
913 if not s:
914 break
915 s = s.strip().split(";")
916 char = int(s[0], 16)
917 table[char] = s
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000918
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000919 cjk_ranges_found = []
920
Martin v. Löwis97225da2002-11-24 23:05:09 +0000921 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000922 if expand:
923 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000924 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000925 s = table[i]
926 if s:
927 if s[1][-6:] == "First>":
928 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000929 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000930 elif s[1][-5:] == "Last>":
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000931 if s[1].startswith("<CJK Ideograph"):
932 cjk_ranges_found.append((field[0],
933 s[0]))
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000934 s[1] = ""
935 field = None
936 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000937 f2 = field[:]
938 f2[0] = "%X" % i
939 table[i] = f2
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000940 if cjk_check and cjk_ranges != cjk_ranges_found:
941 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000942
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000943 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000944 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000945 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000946 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000947
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300948 # check for name aliases and named sequences, see #12753
949 # aliases and named sequences are not in 3.2.0
950 if version != '3.2.0':
951 self.aliases = []
952 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
953 # in order to take advantage of the compression and lookup
954 # algorithms used for the other characters
955 pua_index = NAME_ALIASES_START
956 with open_data(NAME_ALIASES, version) as file:
957 for s in file:
958 s = s.strip()
959 if not s or s.startswith('#'):
960 continue
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500961 char, name, abbrev = s.split(';')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300962 char = int(char, 16)
963 self.aliases.append((name, char))
964 # also store the name in the PUA 1
965 self.table[pua_index][1] = name
966 pua_index += 1
967 assert pua_index - NAME_ALIASES_START == len(self.aliases)
968
969 self.named_sequences = []
970 # store named seqences in the PUA 1, in range U+F0100..,
971 # in order to take advantage of the compression and lookup
972 # algorithms used for the other characters.
973
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500974 assert pua_index < NAMED_SEQUENCES_START
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300975 pua_index = NAMED_SEQUENCES_START
976 with open_data(NAMED_SEQUENCES, version) as file:
977 for s in file:
978 s = s.strip()
979 if not s or s.startswith('#'):
980 continue
981 name, chars = s.split(';')
982 chars = tuple(int(char, 16) for char in chars.split())
983 # check that the structure defined in makeunicodename is OK
984 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
985 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
986 "the NamedSequence struct and in unicodedata_lookup")
987 self.named_sequences.append((name, chars))
988 # also store these in the PUA 1
989 self.table[pua_index][1] = name
990 pua_index += 1
991 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
992
Martin v. Löwis677bde22002-11-23 22:08:15 +0000993 self.exclusions = {}
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300994 with open_data(COMPOSITION_EXCLUSIONS, version) as file:
995 for s in file:
996 s = s.strip()
997 if not s:
998 continue
999 if s[0] == '#':
1000 continue
1001 char = int(s.split()[0],16)
1002 self.exclusions[char] = 1
Martin v. Löwis677bde22002-11-23 22:08:15 +00001003
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001004 widths = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001005 with open_data(EASTASIAN_WIDTH, version) as file:
1006 for s in file:
1007 s = s.strip()
1008 if not s:
1009 continue
1010 if s[0] == '#':
1011 continue
1012 s = s.split()[0].split(';')
1013 if '..' in s[0]:
1014 first, last = [int(c, 16) for c in s[0].split('..')]
1015 chars = list(range(first, last+1))
1016 else:
1017 chars = [int(s[0], 16)]
1018 for char in chars:
1019 widths[char] = s[1]
1020
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001021 for i in range(0, 0x110000):
1022 if table[i] is not None:
1023 table[i].append(widths[i])
1024
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001025 for i in range(0, 0x110000):
1026 if table[i] is not None:
1027 table[i].append(set())
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001028
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001029 with open_data(DERIVED_CORE_PROPERTIES, version) as file:
1030 for s in file:
1031 s = s.split('#', 1)[0].strip()
1032 if not s:
1033 continue
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001034
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001035 r, p = s.split(";")
1036 r = r.strip()
1037 p = p.strip()
1038 if ".." in r:
1039 first, last = [int(c, 16) for c in r.split('..')]
1040 chars = list(range(first, last+1))
1041 else:
1042 chars = [int(r, 16)]
1043 for char in chars:
1044 if table[char]:
1045 # Some properties (e.g. Default_Ignorable_Code_Point)
1046 # apply to unassigned code points; ignore them
1047 table[char][-1].add(p)
1048
1049 with open_data(LINE_BREAK, version) as file:
1050 for s in file:
1051 s = s.partition('#')[0]
1052 s = [i.strip() for i in s.split(';')]
1053 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1054 continue
1055 if '..' not in s[0]:
1056 first = last = int(s[0], 16)
1057 else:
1058 first, last = [int(c, 16) for c in s[0].split('..')]
1059 for char in range(first, last+1):
1060 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +00001061
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001062 # We only want the quickcheck properties
1063 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1064 # Yes is the default, hence only N and M occur
1065 # In 3.2.0, the format was different (NF?_NO)
1066 # The parsing will incorrectly determine these as
1067 # "yes", however, unicodedata.c will not perform quickchecks
1068 # for older versions, and no delta records will be created.
1069 quickchecks = [0] * 0x110000
1070 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001071 with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
1072 for s in file:
1073 if '#' in s:
1074 s = s[:s.index('#')]
1075 s = [i.strip() for i in s.split(';')]
1076 if len(s) < 2 or s[1] not in qc_order:
1077 continue
1078 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1079 quickcheck_shift = qc_order.index(s[1])*2
1080 quickcheck <<= quickcheck_shift
1081 if '..' not in s[0]:
1082 first = last = int(s[0], 16)
1083 else:
1084 first, last = [int(c, 16) for c in s[0].split('..')]
1085 for char in range(first, last+1):
1086 assert not (quickchecks[char]>>quickcheck_shift)&3
1087 quickchecks[char] |= quickcheck
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001088 for i in range(0, 0x110000):
1089 if table[i] is not None:
1090 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +00001091
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001092 with open_data(UNIHAN, version) as file:
1093 zip = zipfile.ZipFile(file)
1094 if version == '3.2.0':
1095 data = zip.open('Unihan-3.2.0.txt').read()
1096 else:
1097 data = zip.open('Unihan_NumericValues.txt').read()
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001098 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001099 if not line.startswith('U+'):
1100 continue
1101 code, tag, value = line.split(None, 3)[:3]
1102 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1103 'kOtherNumeric'):
1104 continue
1105 value = value.strip().replace(',', '')
1106 i = int(code[2:], 16)
1107 # Patch the numeric field
1108 if table[i] is not None:
1109 table[i][8] = value
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05001110 sc = self.special_casing = {}
1111 with open_data(SPECIAL_CASING, version) as file:
1112 for s in file:
1113 s = s[:-1].split('#', 1)[0]
1114 if not s:
1115 continue
1116 data = s.split("; ")
1117 if data[4]:
1118 # We ignore all conditionals (since they depend on
1119 # languages) except for one, which is hardcoded. See
1120 # handle_capital_sigma in unicodeobject.c.
1121 continue
1122 c = int(data[0], 16)
1123 lower = [int(char, 16) for char in data[1].split()]
1124 title = [int(char, 16) for char in data[2].split()]
1125 upper = [int(char, 16) for char in data[3].split()]
1126 sc[c] = (lower, title, upper)
Benjamin Petersond5890c82012-01-14 13:23:30 -05001127 cf = self.case_folding = {}
1128 if version != '3.2.0':
1129 with open_data(CASE_FOLDING, version) as file:
1130 for s in file:
1131 s = s[:-1].split('#', 1)[0]
1132 if not s:
1133 continue
1134 data = s.split("; ")
1135 if data[1] in "CF":
1136 c = int(data[0], 16)
1137 cf[c] = [int(char, 16) for char in data[2].split()]
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001138
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001139 def uselatin1(self):
1140 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +00001141 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001142
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001143# hash table tools
1144
1145# this is a straight-forward reimplementation of Python's built-in
1146# dictionary type, using a static data structure, and a custom string
1147# hash algorithm.
1148
def myhash(s, magic):
    """Case-insensitive string hash used for the static name hash table.

    The running hash is multiplied by *magic* for each character of the
    upper-cased string; whenever the value grows past 24 bits, the
    overflowing byte is XOR-folded back in, so the result always fits
    in 24 bits.  Must match the C implementation in unicodedata.c.
    """
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        overflow = h & 0xff000000
        if overflow:
            # fold the carried-out byte back into the low 24 bits
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
1157
# (table size, polynomial offset) pairs used by Hash to choose a
# power-of-two table size just larger than the data set; Hash adds the
# second element to the size to form the probe-sequence polynomial.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1164
class Hash:
    """A static open-addressed hash table, dumpable as a C array.

    Built once from a (key, value) list; the C lookup code in
    unicodedata.c replays the identical probing sequence using the
    emitted magic/size/poly constants, so the probe logic here must not
    change independently of it.
    """
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: the first power of two from SIZES that
        # exceeds the data set, with its associated polynomial
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly   # fold the table size into the polynomial
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0   # collision counter (for reporting only)

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot occupied: probe with a second hash-derived increment,
            # stepping it through the polynomial on overflow
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots become 0 in the emitted C array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the constants
        # the C lookup code needs to replay the same probing sequence
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1228
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001229# stuff to deal with arrays of unsigned integers
1230
class Array:
    """A named array of unsigned integers, writable as C source."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as a static C declaration.

        The element type is the smallest unsigned C type that holds
        every value (per getsize); output lines wrap at ~78 columns.
        If *trace* is true, the byte size is reported on stderr.
        """
        itemsize = getsize(self.data)
        if trace:
            print(self.name+":", itemsize*len(self.data), "bytes", file=sys.stderr)
        if itemsize == 1:
            ctype = "unsigned char"
        elif itemsize == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = "    "
            for item in self.data:
                piece = str(item) + ", "
                if len(line) + len(piece) > 78:
                    # current line is full; flush and start a new one
                    file.write(line + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1262
def getsize(data):
    """Return the smallest unsigned C integer size in bytes (1, 2 or 4)
    that can hold every value in *data*.

    An empty sequence now returns the minimal width 1; previously
    ``max(data)`` raised ValueError, so Array.dump crashed on an empty
    array even though it explicitly handles empty data when writing.
    """
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
1272
Tim Peters21013482000-09-25 07:13:41 +00001273def splitbins(t, trace=0):
1274 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1275
1276 t is a sequence of ints. This function can be useful to save space if
1277 many of the ints are the same. t1 and t2 are lists of ints, and shift
1278 is an int, chosen to minimize the combined size of t1 and t2 (in C
1279 code), and where for each i in range(len(t)),
1280 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1281 where mask is a bitmask isolating the last "shift" bits.
1282
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001283 If optional arg trace is non-zero (default zero), progress info
1284 is printed to sys.stderr. The higher the value, the more info
1285 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001286 """
1287
Tim Peters21013482000-09-25 07:13:41 +00001288 if trace:
1289 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001290 print("%d+%d bins at shift %d; %d bytes" % (
1291 len(t1), len(t2), shift, bytes), file=sys.stderr)
1292 print("Size of original table:", len(t)*getsize(t), \
1293 "bytes", file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001294 n = len(t)-1 # last valid index
1295 maxshift = 0 # the most we can shift n and still have something left
1296 if n > 0:
1297 while n >> 1:
1298 n >>= 1
1299 maxshift += 1
1300 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001301 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001302 t = tuple(t) # so slices can be dict keys
1303 for shift in range(maxshift + 1):
1304 t1 = []
1305 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001306 size = 2**shift
1307 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001308 for i in range(0, len(t), size):
1309 bin = t[i:i+size]
1310 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001311 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001312 index = len(t2)
1313 bincache[bin] = index
1314 t2.extend(bin)
1315 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001316 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001317 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001318 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001319 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001320 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001321 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001322 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001323 t1, t2, shift = best
1324 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001325 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001326 dump(t1, t2, shift, bytes)
1327 if __debug__:
1328 # exhaustively verify that the decomposition is correct
1329 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001330 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001331 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1332 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001333
if __name__ == "__main__":
    # regenerate all database headers; the argument is presumably the
    # trace level passed through to maketables (defined earlier in this
    # file) -- confirm against its signature
    maketables(1)