blob: 0795d9e72fff87a30b74924292ac60ecf5c85ea8 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00004# this script converts a unicode 3.2 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000022# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Georg Brandld52429f2008-07-04 15:55:02 +000023# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
Ezio Melotti931b8aa2011-10-21 21:57:36 +030024# 2011-10-21 ezio add support for name aliases and named sequences
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050025# 2012-01 benjamin add full case mappings
Fredrik Lundhcfcea492000-09-25 08:07:06 +000026#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000027# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000028#
29
Ezio Melotti931b8aa2011-10-21 21:57:36 +030030import os
31import sys
32import zipfile
33
34from textwrap import dedent
35from operator import itemgetter
Fredrik Lundhf367cac2000-09-24 23:18:31 +000036
SCRIPT = sys.argv[0]   # name of this generator, embedded in the output headers
VERSION = "3.2"        # version of this *script*, not of Unicode

# The Unicode Database
UNIDATA_VERSION = "6.0.0"
# UCD filename templates; "%s" is filled with "" for the current version
# or "-<version>" for an old UCD snapshot.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0100

# older UCD versions for which delta tables are generated
old_versions = ["3.2.0"]

# General-category abbreviations; list position is the numeric code emitted
# into the C tables.  NOTE: "Cn" appears twice (index 0 and 17) — index 0
# doubles as the default for unassigned code points.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class abbreviations; position is the numeric code emitted.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East-Asian-width abbreviations; position is the numeric code emitted.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes that force a mandatory break (from LineBreak.txt).
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# Bit flags packed into the type-record "flags" field.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# CJK unified-ideograph first/last code points (hex strings).
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FCB'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D')
]
103
def maketables(trace=0):
    """Drive the full regeneration: load the current and old UCD versions,
    merge the deltas, then write the name, data, and type headers."""

    def report(db):
        # Count the code points that actually carry a record.
        print(sum(1 for entry in db.table if entry), "characters")

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNIDATA_VERSION)
    report(unicode)

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        report(old_unicode)
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000122
123# --------------------------------------------------------------------
124# unicode character properties
125
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h from the loaded UnicodeData.

    Emits: the deduplicated database records, the decomposition tables,
    the NFC composition-pair tables, and one change/normalization delta
    table per entry in unicode.changed (old UCD versions).

    unicode -- a UnicodeData instance (table indexed by code point)
    trace   -- passed through to splitbins()/Array.dump() diagnostics
    """

    # A record is (category, combining, bidirectional, mirrored,
    # east_asian_width, quickcheck); slot 0 is the all-zero dummy used
    # for unassigned code points.
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties (field numbers follow the
            # UnicodeData.txt column layout, UAX #44)
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables; identical records share
            # one slot via the cache
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]        # flat int stream: [header, codepoint, ...] runs
    decomp_prefix = [""]     # compatibility tags ("<compat>", ...); "" = canonical
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # unicodeobject.c's nfd_nfkd buffer bounds the expansion
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix (compatibility tag, if any)
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: one header word (prefix index in the low byte,
                # length in the next byte) followed by the code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical two-char decompositions,
                # not excluded, whose first char has combining class 0
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    # NOTE(review): `p` (the header word) is unpacked but unused
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints, so
                # .index(decomp) (a list) can never match and this always
                # falls through to ValueError — entries are appended anew
                # and never shared.  Harmless for correctness, but the
                # dedup appears ineffective; confirm before relying on it.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber the NFC first/last participants consecutively and collect
    # the contiguous code-point ranges they occupy.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense (first x last) matrix mapping a pair to its composed char.
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("    { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("    {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("    { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("    {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table into a two-level lookup
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions: per-version change records
    # plus C accessors get_change_X_Y_Z() and normalization_X_Y_Z().
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()
374
375# --------------------------------------------------------------------
376# unicode character type tables
377
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h from the loaded UnicodeData.

    Emits: the deduplicated (upper, lower, title, decimal, digit, flags)
    type records, the extended-case array for multi-character mappings,
    the two-level index, and generated C bodies for
    _PyUnicode_ToNumeric(), _PyUnicode_IsWhitespace() and
    _PyUnicode_IsLinebreak().

    unicode -- a UnicodeData instance (table indexed by code point)
    trace   -- passed through to splitbins()/Array.dump() diagnostics
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types; slot 0 is the all-zero dummy record
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}        # numeric-value string -> [code points]
    spaces = []         # code points flagged SPACE_MASK
    linebreaks = []     # code points flagged LINEBREAK_MASK
    extra_casing = []   # flat Py_UCS4 array for multi-char case mappings

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties (UnicodeData.txt columns;
            # record[16] holds derived property names, record[17] quickcheck)
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            # NOTE(review): `delta` is never read in this function —
            # leftover from the removed delta-encoding scheme.
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # printable: everything except other (C*) and separator (Z*)
            # categories, with U+0020 SPACE as the sole exception
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            # simple one-to-one mappings from UnicodeData.txt columns 12-14;
            # empty field means the character maps to itself
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # title defaults to the uppercase mapping
                title = upper
            # a case-folding different from the lowercase forces the
            # extended-case path even without a SpecialCasing entry
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                # packed reference: offset in the low bits, length << 24,
                # and (for lower) the folding length << 20 when it differs
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables; identical records share
            # one slot via the cache
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table into a two-level lookup
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals (fractions like "1/4" stay a
        # C division of two float literals)
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
580
581# --------------------------------------------------------------------
582# unicode name database
583
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h: the code point <-> name database.

    unicode is a UnicodeData instance; trace enables size statistics on
    stderr.  The output contains a compressed "phrasebook" (code -> name),
    a static hash table (name -> code), and the alias / named-sequence
    tables used by the unicodedata module.
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names, NUL-terminated; entries starting with "<" are range
    # markers (e.g. "<CJK Ideograph, First>"), not real names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence, and words ending a sentence.  the
    # latter includes the trailing null byte.)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                # each dict value is a list whose *length* is the word's
                # frequency; the first element is the insertion index
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    # word indexes below 'short' fit in one phrasebook byte; the rest are
    # encoded as an escape byte (>= short) plus a second byte
    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics: total occurrences of the 'short' most frequent words
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string; words[w] maps a word to its 1-based index
    # into lexicon_offset (index 0 is a sentinel)
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon: each name becomes a
    # sequence of word indexes (one or two bytes per word, see 'short')
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry must fit in an unsigned char
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    print(file=fp)
    print('static const unsigned int aliases_start = %#x;' %
          NAME_ALIASES_START, file=fp)
    print('static const unsigned int aliases_end = %#x;' %
          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

    print('static const unsigned int name_aliases[] = {', file=fp)
    for name, codepoint in unicode.aliases:
        print('    0x%04X,' % codepoint, file=fp)
    print('};', file=fp)

    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
    # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
    # sequences or sequences with non-BMP chars are added.
    # unicodedata_lookup should be adapted too.
    print(dedent("""
        typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
        } named_sequence;
        """), file=fp)

    print('static const unsigned int named_sequences_start = %#x;' %
          NAMED_SEQUENCES_START, file=fp)
    print('static const unsigned int named_sequences_end = %#x;' %
          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

    print('static const named_sequence named_sequences[] = {', file=fp)
    for name, sequence in unicode.named_sequences:
        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
    print('};', file=fp)

    fp.close()
775
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000776
def merge_old_version(version, new, old):
    """Record the differences between an old UCD version and the new one.

    Appends a (version, changes, normalization_changes) tuple to
    new.changed, where changes is a per-codepoint list of
    (bidir, category, decimal, mirrored, numeric) deltas.  The k indexes
    below are field positions in the UnicodeData record (see the
    "Record structure" comment on UnicodeData).
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        # any other field difference is unexpected; fail loudly
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                       normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000867
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000868def open_data(template, version):
869 local = template % ('-'+version,)
870 if not os.path.exists(local):
871 import urllib.request
872 if version == '3.2.0':
873 # irregular url structure
874 url = 'http://www.unicode.org/Public/3.2-Update/' + local
875 else:
876 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
877 urllib.request.urlretrieve(url, filename=local)
878 if local.endswith('.txt'):
879 return open(local, encoding='utf-8')
880 else:
881 # Unihan.zip
882 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000883
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000884# --------------------------------------------------------------------
885# the following support code is taken from the unidb utilities
886# Copyright (c) 1999-2000 by Secret Labs AB
887
888# load a unicode-data file from disk
889
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000890class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000891 # Record structure:
892 # [ID, name, category, combining, bidi, decomp, (6)
893 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
894 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
895 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000896
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000897 def __init__(self, version,
898 linebreakprops=False,
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000899 expand=1,
900 cjk_check=True):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000901 self.changed = []
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000902 table = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300903 with open_data(UNICODE_DATA, version) as file:
904 while 1:
905 s = file.readline()
906 if not s:
907 break
908 s = s.strip().split(";")
909 char = int(s[0], 16)
910 table[char] = s
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000911
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000912 cjk_ranges_found = []
913
Martin v. Löwis97225da2002-11-24 23:05:09 +0000914 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000915 if expand:
916 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000917 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000918 s = table[i]
919 if s:
920 if s[1][-6:] == "First>":
921 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000922 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000923 elif s[1][-5:] == "Last>":
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000924 if s[1].startswith("<CJK Ideograph"):
925 cjk_ranges_found.append((field[0],
926 s[0]))
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000927 s[1] = ""
928 field = None
929 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000930 f2 = field[:]
931 f2[0] = "%X" % i
932 table[i] = f2
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000933 if cjk_check and cjk_ranges != cjk_ranges_found:
934 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000935
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000936 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000937 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000938 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000939 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000940
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300941 # check for name aliases and named sequences, see #12753
942 # aliases and named sequences are not in 3.2.0
943 if version != '3.2.0':
944 self.aliases = []
945 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
946 # in order to take advantage of the compression and lookup
947 # algorithms used for the other characters
948 pua_index = NAME_ALIASES_START
949 with open_data(NAME_ALIASES, version) as file:
950 for s in file:
951 s = s.strip()
952 if not s or s.startswith('#'):
953 continue
954 char, name = s.split(';')
955 char = int(char, 16)
956 self.aliases.append((name, char))
957 # also store the name in the PUA 1
958 self.table[pua_index][1] = name
959 pua_index += 1
960 assert pua_index - NAME_ALIASES_START == len(self.aliases)
961
962 self.named_sequences = []
963 # store named seqences in the PUA 1, in range U+F0100..,
964 # in order to take advantage of the compression and lookup
965 # algorithms used for the other characters.
966
967 pua_index = NAMED_SEQUENCES_START
968 with open_data(NAMED_SEQUENCES, version) as file:
969 for s in file:
970 s = s.strip()
971 if not s or s.startswith('#'):
972 continue
973 name, chars = s.split(';')
974 chars = tuple(int(char, 16) for char in chars.split())
975 # check that the structure defined in makeunicodename is OK
976 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
977 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
978 "the NamedSequence struct and in unicodedata_lookup")
979 self.named_sequences.append((name, chars))
980 # also store these in the PUA 1
981 self.table[pua_index][1] = name
982 pua_index += 1
983 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
984
Martin v. Löwis677bde22002-11-23 22:08:15 +0000985 self.exclusions = {}
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300986 with open_data(COMPOSITION_EXCLUSIONS, version) as file:
987 for s in file:
988 s = s.strip()
989 if not s:
990 continue
991 if s[0] == '#':
992 continue
993 char = int(s.split()[0],16)
994 self.exclusions[char] = 1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000995
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000996 widths = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300997 with open_data(EASTASIAN_WIDTH, version) as file:
998 for s in file:
999 s = s.strip()
1000 if not s:
1001 continue
1002 if s[0] == '#':
1003 continue
1004 s = s.split()[0].split(';')
1005 if '..' in s[0]:
1006 first, last = [int(c, 16) for c in s[0].split('..')]
1007 chars = list(range(first, last+1))
1008 else:
1009 chars = [int(s[0], 16)]
1010 for char in chars:
1011 widths[char] = s[1]
1012
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001013 for i in range(0, 0x110000):
1014 if table[i] is not None:
1015 table[i].append(widths[i])
1016
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001017 for i in range(0, 0x110000):
1018 if table[i] is not None:
1019 table[i].append(set())
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001020
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001021 with open_data(DERIVED_CORE_PROPERTIES, version) as file:
1022 for s in file:
1023 s = s.split('#', 1)[0].strip()
1024 if not s:
1025 continue
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001026
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001027 r, p = s.split(";")
1028 r = r.strip()
1029 p = p.strip()
1030 if ".." in r:
1031 first, last = [int(c, 16) for c in r.split('..')]
1032 chars = list(range(first, last+1))
1033 else:
1034 chars = [int(r, 16)]
1035 for char in chars:
1036 if table[char]:
1037 # Some properties (e.g. Default_Ignorable_Code_Point)
1038 # apply to unassigned code points; ignore them
1039 table[char][-1].add(p)
1040
1041 with open_data(LINE_BREAK, version) as file:
1042 for s in file:
1043 s = s.partition('#')[0]
1044 s = [i.strip() for i in s.split(';')]
1045 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1046 continue
1047 if '..' not in s[0]:
1048 first = last = int(s[0], 16)
1049 else:
1050 first, last = [int(c, 16) for c in s[0].split('..')]
1051 for char in range(first, last+1):
1052 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +00001053
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001054 # We only want the quickcheck properties
1055 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1056 # Yes is the default, hence only N and M occur
1057 # In 3.2.0, the format was different (NF?_NO)
1058 # The parsing will incorrectly determine these as
1059 # "yes", however, unicodedata.c will not perform quickchecks
1060 # for older versions, and no delta records will be created.
1061 quickchecks = [0] * 0x110000
1062 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001063 with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
1064 for s in file:
1065 if '#' in s:
1066 s = s[:s.index('#')]
1067 s = [i.strip() for i in s.split(';')]
1068 if len(s) < 2 or s[1] not in qc_order:
1069 continue
1070 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1071 quickcheck_shift = qc_order.index(s[1])*2
1072 quickcheck <<= quickcheck_shift
1073 if '..' not in s[0]:
1074 first = last = int(s[0], 16)
1075 else:
1076 first, last = [int(c, 16) for c in s[0].split('..')]
1077 for char in range(first, last+1):
1078 assert not (quickchecks[char]>>quickcheck_shift)&3
1079 quickchecks[char] |= quickcheck
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001080 for i in range(0, 0x110000):
1081 if table[i] is not None:
1082 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +00001083
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001084 with open_data(UNIHAN, version) as file:
1085 zip = zipfile.ZipFile(file)
1086 if version == '3.2.0':
1087 data = zip.open('Unihan-3.2.0.txt').read()
1088 else:
1089 data = zip.open('Unihan_NumericValues.txt').read()
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001090 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001091 if not line.startswith('U+'):
1092 continue
1093 code, tag, value = line.split(None, 3)[:3]
1094 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1095 'kOtherNumeric'):
1096 continue
1097 value = value.strip().replace(',', '')
1098 i = int(code[2:], 16)
1099 # Patch the numeric field
1100 if table[i] is not None:
1101 table[i][8] = value
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05001102 sc = self.special_casing = {}
1103 with open_data(SPECIAL_CASING, version) as file:
1104 for s in file:
1105 s = s[:-1].split('#', 1)[0]
1106 if not s:
1107 continue
1108 data = s.split("; ")
1109 if data[4]:
1110 # We ignore all conditionals (since they depend on
1111 # languages) except for one, which is hardcoded. See
1112 # handle_capital_sigma in unicodeobject.c.
1113 continue
1114 c = int(data[0], 16)
1115 lower = [int(char, 16) for char in data[1].split()]
1116 title = [int(char, 16) for char in data[2].split()]
1117 upper = [int(char, 16) for char in data[3].split()]
1118 sc[c] = (lower, title, upper)
Benjamin Petersond5890c82012-01-14 13:23:30 -05001119 cf = self.case_folding = {}
1120 if version != '3.2.0':
1121 with open_data(CASE_FOLDING, version) as file:
1122 for s in file:
1123 s = s[:-1].split('#', 1)[0]
1124 if not s:
1125 continue
1126 data = s.split("; ")
1127 if data[1] in "CF":
1128 c = int(data[0], 16)
1129 cf[c] = [int(char, 16) for char in data[2].split()]
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001130
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001131 def uselatin1(self):
1132 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +00001133 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001134
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001135# hash table tools
1136
1137# this is a straight-forward reimplementation of Python's built-in
1138# dictionary type, using a static data structure, and a custom string
1139# hash algorithm.
1140
def myhash(s, magic):
    """Case-insensitive 24-bit string hash used by the static Hash tables.

    Accumulates h = h*magic + ord(c) over the uppercased string, folding
    any overflow above bit 23 back into the low bits so the result always
    fits in 24 bits.
    """
    value = 0
    for code in (ord(ch) for ch in s.upper()):
        value = value * magic + code
        overflow = value & 0xff000000
        if overflow:
            value = (value ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return value
1149
# (table size, polynomial) candidates for Hash, ordered by size.  Sizes are
# powers of two; Hash.__init__ picks the first size larger than the data
# set, and the associated polynomial is XORed into the probe increment to
# resolve collisions.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1156
class Hash:
    """Static open-addressing hash table, emitted as a C array.

    The probe sequence implemented here must exactly match the lookup code
    in Modules/unicodedata.c, since the table is built at generation time
    and only probed at runtime.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first power of two larger than the data set
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # collision: derive a probe increment from the hash
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                # vary the increment using the table's polynomial
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots are stored as 0 in the C array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array plus the #defines the C lookup
        # code needs to reproduce the probe sequence
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1220
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001221# stuff to deal with arrays of unsigned integers
1222
1223class Array:
1224
1225 def __init__(self, name, data):
1226 self.name = name
1227 self.data = data
1228
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001229 def dump(self, file, trace=0):
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001230 # write data to file, as a C array
1231 size = getsize(self.data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +00001232 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001233 print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001234 file.write("static ")
1235 if size == 1:
1236 file.write("unsigned char")
1237 elif size == 2:
1238 file.write("unsigned short")
1239 else:
1240 file.write("unsigned int")
1241 file.write(" " + self.name + "[] = {\n")
1242 if self.data:
1243 s = " "
1244 for item in self.data:
1245 i = str(item) + ", "
1246 if len(s) + len(i) > 78:
1247 file.write(s + "\n")
1248 s = " " + i
1249 else:
1250 s = s + i
Walter Dörwaldaaab30e2002-09-11 20:36:02 +00001251 if s.strip():
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001252 file.write(s + "\n")
1253 file.write("};\n\n")
1254
def getsize(data):
    """Return the smallest C integer width (1, 2 or 4 bytes) that can
    represent every value in *data*."""
    largest = max(data)
    if largest < 256:
        return 1
    if largest < 65536:
        return 2
    return 4
1264
Tim Peters21013482000-09-25 07:13:41 +00001265def splitbins(t, trace=0):
1266 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1267
1268 t is a sequence of ints. This function can be useful to save space if
1269 many of the ints are the same. t1 and t2 are lists of ints, and shift
1270 is an int, chosen to minimize the combined size of t1 and t2 (in C
1271 code), and where for each i in range(len(t)),
1272 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1273 where mask is a bitmask isolating the last "shift" bits.
1274
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001275 If optional arg trace is non-zero (default zero), progress info
1276 is printed to sys.stderr. The higher the value, the more info
1277 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001278 """
1279
Tim Peters21013482000-09-25 07:13:41 +00001280 if trace:
1281 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001282 print("%d+%d bins at shift %d; %d bytes" % (
1283 len(t1), len(t2), shift, bytes), file=sys.stderr)
1284 print("Size of original table:", len(t)*getsize(t), \
1285 "bytes", file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001286 n = len(t)-1 # last valid index
1287 maxshift = 0 # the most we can shift n and still have something left
1288 if n > 0:
1289 while n >> 1:
1290 n >>= 1
1291 maxshift += 1
1292 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001293 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001294 t = tuple(t) # so slices can be dict keys
1295 for shift in range(maxshift + 1):
1296 t1 = []
1297 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001298 size = 2**shift
1299 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001300 for i in range(0, len(t), size):
1301 bin = t[i:i+size]
1302 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001303 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001304 index = len(t2)
1305 bincache[bin] = index
1306 t2.extend(bin)
1307 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001308 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001309 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001310 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001311 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001312 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001313 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001314 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001315 t1, t2, shift = best
1316 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001317 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001318 dump(t1, t2, shift, bytes)
1319 if __debug__:
1320 # exhaustively verify that the decomposition is correct
1321 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001322 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001323 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1324 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001325
if __name__ == "__main__":
    # Regenerate all database headers.  NOTE(review): the argument appears
    # to be a trace flag (cf. the trace parameters above) -- confirm against
    # maketables, which is defined earlier in this file.
    maketables(1)