blob: 140fc6484fa69d3c6e7571cba810b7ff6749d88e [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00004# this script converts a unicode 3.2 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000022# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Georg Brandld52429f2008-07-04 15:55:02 +000023# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
Ezio Melotti931b8aa2011-10-21 21:57:36 +030024# 2011-10-21 ezio add support for name aliases and named sequences
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050025# 2012-01 benjamin add full case mappings
Fredrik Lundhcfcea492000-09-25 08:07:06 +000026#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000027# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000028#
29
Ezio Melotti931b8aa2011-10-21 21:57:36 +030030import os
31import sys
32import zipfile
33
34from textwrap import dedent
35from operator import itemgetter
Fredrik Lundhf367cac2000-09-24 23:18:31 +000036
SCRIPT = sys.argv[0]  # path of this generator script, embedded in the generated headers
VERSION = "3.2"       # version of this script (not of Unicode)

# The Unicode Database
UNIDATA_VERSION = "6.0.0"
# Filename templates for the UCD input files; the %s receives "" for the
# current version or "-<version>" for an old release (see old_versions).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0100

# old Unicode releases for which delta (change) tables are generated
old_versions = ["3.2.0"]

# General_Category names; a record's category is stored as an index into
# this list (see makeunicodedata).  NOTE(review): "Cn" appears twice
# (index 0 and 17) -- presumably slot 0 doubles as the "unassigned"
# marker; confirm against the consumers of unicodedata_db.h.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidi_Class names; stored as an index into this list (slot 0, the empty
# string, means "no value").
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East_Asian_Width names; stored as an index into this list.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes that constitute a mandatory line break.
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FCB'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D')
]
102
def maketables(trace=0):
    """Parse the Unicode database files and regenerate the C header tables.

    Loads the current UNIDATA_VERSION database, folds in every release
    listed in old_versions (used for the version-delta tables), and then
    writes the name, property and type tables.  *trace* is forwarded to
    the individual table writers.
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    # current Unicode database
    ucd = UnicodeData(UNIDATA_VERSION)
    print(sum(1 for rec in ucd.table if rec), "characters")

    # merge each historical release into the current database;
    # old releases are loaded without the CJK-range check
    for old in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+old), "...")
        previous = UnicodeData(old, cjk_check=False)
        print(sum(1 for rec in previous.table if rec), "characters")
        merge_old_version(old, ucd, previous)

    makeunicodename(ucd, trace)
    makeunicodedata(ucd, trace)
    makeunicodetype(ucd, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000121
122# --------------------------------------------------------------------
123# unicode character properties
124
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h from the parsed database *unicode*.

    Emits, in order:
      * _PyUnicode_Database_Records: deduplicated (category, combining,
        bidirectional, mirrored, east-asian-width, quick-check) records,
        indexed per code point through a two-level splitbins() index;
      * the NFC composition tables (nfc_first/nfc_last reindex ranges and
        the first*last pair matrix);
      * the flat decomposition data with its own two-level index;
      * name-string arrays for categories, bidi classes, EAW classes and
        decomposition prefixes;
      * per-old-version change_records and normalization delta functions.

    *trace* is passed through to splitbins()/Array.dump() for statistics.
    """

    # record 0 is the fallback for unassigned code points
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties (category/bidi/EAW are stored as
            # indexes into the corresponding *_NAMES lists)
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]          # flat array: per entry, a header word then code points
    decomp_prefix = [""]       # compatibility tags ("<compat>", ...); "" = canonical
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []                              # (first, last, composed) NFC triples
    comp_first = [None] * len(unicode.chars)     # marks chars usable as NFC first
    comp_last = [None] * len(unicode.chars)      # marks chars usable as NFC last

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # the C code (nfd_nfkd) has a hard limit on expansion length
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix: a leading "<tag>" marks a compatibility decomposition
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: header word packs prefix index (low byte) and
                # length (high bits), followed by the decomposed code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical two-character decompositions
                # whose first element has combining class 0 and which are not
                # listed in CompositionExclusions
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data holds flat ints, so .index(decomp)
                # with a list argument can never match and always raises
                # ValueError -- i.e. identical decompositions are appended
                # anew rather than shared.  Confirm whether that is intended.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Compress the NFC first/last marker arrays into contiguous ranges and
    # renumber the marked characters 0..total-1 for the pair matrix.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # dense (first x last) matrix; entry holds the composed code point (0 = none)
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("    { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("    {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("    { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("    {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions: each entry of unicode.changed
    # carries a per-code-point record table plus normalization overrides.
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()
373
374# --------------------------------------------------------------------
375# unicode character type tables
376
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h from the parsed database *unicode*.

    Emits the deduplicated _PyUnicode_TypeRecords table of
    (upper, lower, title, decimal, digit, flags) tuples with a two-level
    splitbins() index, the _PyUnicode_ExtendedCase array backing one-to-many
    case mappings, and C source for _PyUnicode_ToNumeric(),
    _PyUnicode_IsWhitespace() and _PyUnicode_IsLinebreak().
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types; record 0 is the fallback for unassigned chars
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}        # Numeric_Value string -> [code points]
    spaces = []         # code points reported by _PyUnicode_IsWhitespace
    linebreaks = []     # code points reported by _PyUnicode_IsLinebreak
    extra_casing = []   # flat Py_UCS4 array for one-to-many case mappings

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            # NOTE(review): 'delta' is assigned but never read in this
            # function -- confirm it is vestigial before removing.
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # printable: everything except "Other" and "Separator"
            # categories, with U+0020 SPACE as the sole exception
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            if sc is None:
                # simple one-to-one mappings straight from UnicodeData.txt;
                # empty fields mean the character maps to itself
                if record[12]:
                    upper = int(record[12], 16)
                else:
                    upper = char
                if record[13]:
                    lower = int(record[13], 16)
                else:
                    lower = char
                if record[14]:
                    title = int(record[14], 16)
                else:
                    title = upper
                # all-identity mappings are stored as 0
                if upper == lower == title:
                    upper = lower = title = 0
            else:
                # This happens when some character maps to more than one
                # character in uppercase, lowercase, or titlecase. The extra
                # characters are stored in a different array: the low 24 bits
                # hold the offset into extra_casing, the high bits the length.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split the per-code-point record index into a two-level table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals (fractions like "1/4" become
        # a C division of two float literals)
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
571
572# --------------------------------------------------------------------
573# unicode name database
574
575def makeunicodename(unicode, trace):
576
577 FILE = "Modules/unicodename_db.h"
578
Collin Winter6afaeb72007-08-03 17:06:41 +0000579 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000580
581 # collect names
582 names = [None] * len(unicode.chars)
583
584 for char in unicode.chars:
585 record = unicode.table[char]
586 if record:
587 name = record[1].strip()
588 if name and name[0] != "<":
589 names[char] = name + chr(0)
590
Georg Brandl559e5d72008-06-11 18:37:52 +0000591 print(len(list(n for n in names if n is not None)), "distinct names")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000592
593 # collect unique words from names (note that we differ between
594 # words inside a sentence, and words ending a sentence. the
595 # latter includes the trailing null byte.
596
597 words = {}
598 n = b = 0
599 for char in unicode.chars:
600 name = names[char]
601 if name:
602 w = name.split()
603 b = b + len(name)
604 n = n + len(w)
605 for w in w:
606 l = words.get(w)
607 if l:
608 l.append(None)
609 else:
610 words[w] = [len(words)]
611
Collin Winter6afaeb72007-08-03 17:06:41 +0000612 print(n, "words in text;", b, "bytes")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000613
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000614 wordlist = list(words.items())
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000615
Martin v. Löwis97225da2002-11-24 23:05:09 +0000616 # sort on falling frequency, then by name
Mark Dickinsona56c4672009-01-27 18:17:45 +0000617 def word_key(a):
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000618 aword, alist = a
Mark Dickinsona56c4672009-01-27 18:17:45 +0000619 return -len(alist), aword
620 wordlist.sort(key=word_key)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000621
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000622 # figure out how many phrasebook escapes we need
623 escapes = 0
624 while escapes * 256 < len(wordlist):
625 escapes = escapes + 1
Collin Winter6afaeb72007-08-03 17:06:41 +0000626 print(escapes, "escapes")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000627
628 short = 256 - escapes
629
630 assert short > 0
631
Collin Winter6afaeb72007-08-03 17:06:41 +0000632 print(short, "short indexes in lexicon")
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000633
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000634 # statistics
635 n = 0
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000636 for i in range(short):
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000637 n = n + len(wordlist[i][1])
Collin Winter6afaeb72007-08-03 17:06:41 +0000638 print(n, "short indexes in phrasebook")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000639
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000640 # pick the most commonly used words, and sort the rest on falling
641 # length (to maximize overlap)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000642
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000643 wordlist, wordtail = wordlist[:short], wordlist[short:]
Raymond Hettingerd4cb56d2008-01-30 02:55:10 +0000644 wordtail.sort(key=lambda a: a[0], reverse=True)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000645 wordlist.extend(wordtail)
646
647 # generate lexicon from words
648
649 lexicon_offset = [0]
650 lexicon = ""
651 words = {}
652
653 # build a lexicon string
654 offset = 0
655 for w, x in wordlist:
656 # encoding: bit 7 indicates last character in word (chr(128)
657 # indicates the last character in an entire string)
658 ww = w[:-1] + chr(ord(w[-1])+128)
659 # reuse string tails, when possible
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000660 o = lexicon.find(ww)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000661 if o < 0:
662 o = offset
663 lexicon = lexicon + ww
664 offset = offset + len(w)
665 words[w] = len(lexicon_offset)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000666 lexicon_offset.append(o)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000667
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000668 lexicon = list(map(ord, lexicon))
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000669
670 # generate phrasebook from names and lexicon
671 phrasebook = [0]
672 phrasebook_offset = [0] * len(unicode.chars)
673 for char in unicode.chars:
674 name = names[char]
675 if name:
676 w = name.split()
677 phrasebook_offset[char] = len(phrasebook)
678 for w in w:
679 i = words[w]
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000680 if i < short:
681 phrasebook.append(i)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000682 else:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000683 # store as two bytes
684 phrasebook.append((i>>8) + short)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000685 phrasebook.append(i&255)
686
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000687 assert getsize(phrasebook) == 1
688
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000689 #
690 # unicode name hash table
691
692 # extract names
693 data = []
694 for char in unicode.chars:
695 record = unicode.table[char]
696 if record:
697 name = record[1].strip()
698 if name and name[0] != "<":
699 data.append((name, char))
700
701 # the magic number 47 was chosen to minimize the number of
702 # collisions on the current data set. if you like, change it
703 # and see what happens...
704
705 codehash = Hash("code", data, 47)
706
Collin Winter6afaeb72007-08-03 17:06:41 +0000707 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000708
709 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000710 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
711 print(file=fp)
712 print("#define NAME_MAXLEN", 256, file=fp)
713 print(file=fp)
714 print("/* lexicon */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000715 Array("lexicon", lexicon).dump(fp, trace)
716 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000717
718 # split decomposition index table
719 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
720
Collin Winter6afaeb72007-08-03 17:06:41 +0000721 print("/* code->name phrasebook */", file=fp)
722 print("#define phrasebook_shift", shift, file=fp)
723 print("#define phrasebook_short", short, file=fp)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000724
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000725 Array("phrasebook", phrasebook).dump(fp, trace)
726 Array("phrasebook_offset1", offset1).dump(fp, trace)
727 Array("phrasebook_offset2", offset2).dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000728
Collin Winter6afaeb72007-08-03 17:06:41 +0000729 print("/* name->code dictionary */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000730 codehash.dump(fp, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000731
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300732 print(file=fp)
733 print('static const unsigned int aliases_start = %#x;' %
734 NAME_ALIASES_START, file=fp)
735 print('static const unsigned int aliases_end = %#x;' %
736 (NAME_ALIASES_START + len(unicode.aliases)), file=fp)
737
738 print('static const unsigned int name_aliases[] = {', file=fp)
739 for name, codepoint in unicode.aliases:
740 print(' 0x%04X,' % codepoint, file=fp)
741 print('};', file=fp)
742
743 # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
744 # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
745 # sequences or sequences with non-BMP chars are added.
746 # unicodedata_lookup should be adapted too.
747 print(dedent("""
748 typedef struct NamedSequence {
749 int seqlen;
750 Py_UCS2 seq[4];
751 } named_sequence;
752 """), file=fp)
753
754 print('static const unsigned int named_sequences_start = %#x;' %
755 NAMED_SEQUENCES_START, file=fp)
756 print('static const unsigned int named_sequences_end = %#x;' %
757 (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)
758
759 print('static const named_sequence named_sequences[] = {', file=fp)
760 for name, sequence in unicode.named_sequences:
761 seq_str = ', '.join('0x%04X' % cp for cp in sequence)
762 print(' {%d, {%s}},' % (len(sequence), seq_str), file=fp)
763 print('};', file=fp)
764
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000765 fp.close()
766
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000767
def merge_old_version(version, new, old):
    """Record the delta between the old UCD *old* and the current UCD *new*.

    Appends a (version, change-records, normalization-changes) tuple to
    new.changed; change records are per-codepoint lists where 0xFF (or 0
    for numeric values) encodes "no change".  Raises NotImplementedError
    if the composition exclusions differ, and an ad-hoc Difference
    exception for any field change this function does not yet handle.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of (codepoint, old-decomposition) pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ, field by field; k indexes the
        # record structure documented on class UnicodeData
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        # general category
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        # bidirectional class
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # decomposition
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # decimal digit value
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # numeric value
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi-mirrored flag, stored as '0'/'1' characters
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000858
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000859def open_data(template, version):
860 local = template % ('-'+version,)
861 if not os.path.exists(local):
862 import urllib.request
863 if version == '3.2.0':
864 # irregular url structure
865 url = 'http://www.unicode.org/Public/3.2-Update/' + local
866 else:
867 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
868 urllib.request.urlretrieve(url, filename=local)
869 if local.endswith('.txt'):
870 return open(local, encoding='utf-8')
871 else:
872 # Unihan.zip
873 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000874
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000875# --------------------------------------------------------------------
876# the following support code is taken from the unidb utilities
877# Copyright (c) 1999-2000 by Secret Labs AB
878
879# load a unicode-data file from disk
880
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000881class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000882 # Record structure:
883 # [ID, name, category, combining, bidi, decomp, (6)
884 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
885 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
886 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000887
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000888 def __init__(self, version,
889 linebreakprops=False,
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000890 expand=1,
891 cjk_check=True):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000892 self.changed = []
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000893 table = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300894 with open_data(UNICODE_DATA, version) as file:
895 while 1:
896 s = file.readline()
897 if not s:
898 break
899 s = s.strip().split(";")
900 char = int(s[0], 16)
901 table[char] = s
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000902
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000903 cjk_ranges_found = []
904
Martin v. Löwis97225da2002-11-24 23:05:09 +0000905 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000906 if expand:
907 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000908 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000909 s = table[i]
910 if s:
911 if s[1][-6:] == "First>":
912 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000913 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000914 elif s[1][-5:] == "Last>":
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000915 if s[1].startswith("<CJK Ideograph"):
916 cjk_ranges_found.append((field[0],
917 s[0]))
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000918 s[1] = ""
919 field = None
920 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000921 f2 = field[:]
922 f2[0] = "%X" % i
923 table[i] = f2
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000924 if cjk_check and cjk_ranges != cjk_ranges_found:
925 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000926
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000927 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000928 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000929 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000930 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000931
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300932 # check for name aliases and named sequences, see #12753
933 # aliases and named sequences are not in 3.2.0
934 if version != '3.2.0':
935 self.aliases = []
936 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
937 # in order to take advantage of the compression and lookup
938 # algorithms used for the other characters
939 pua_index = NAME_ALIASES_START
940 with open_data(NAME_ALIASES, version) as file:
941 for s in file:
942 s = s.strip()
943 if not s or s.startswith('#'):
944 continue
945 char, name = s.split(';')
946 char = int(char, 16)
947 self.aliases.append((name, char))
948 # also store the name in the PUA 1
949 self.table[pua_index][1] = name
950 pua_index += 1
951 assert pua_index - NAME_ALIASES_START == len(self.aliases)
952
953 self.named_sequences = []
954 # store named seqences in the PUA 1, in range U+F0100..,
955 # in order to take advantage of the compression and lookup
956 # algorithms used for the other characters.
957
958 pua_index = NAMED_SEQUENCES_START
959 with open_data(NAMED_SEQUENCES, version) as file:
960 for s in file:
961 s = s.strip()
962 if not s or s.startswith('#'):
963 continue
964 name, chars = s.split(';')
965 chars = tuple(int(char, 16) for char in chars.split())
966 # check that the structure defined in makeunicodename is OK
967 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
968 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
969 "the NamedSequence struct and in unicodedata_lookup")
970 self.named_sequences.append((name, chars))
971 # also store these in the PUA 1
972 self.table[pua_index][1] = name
973 pua_index += 1
974 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
975
Martin v. Löwis677bde22002-11-23 22:08:15 +0000976 self.exclusions = {}
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300977 with open_data(COMPOSITION_EXCLUSIONS, version) as file:
978 for s in file:
979 s = s.strip()
980 if not s:
981 continue
982 if s[0] == '#':
983 continue
984 char = int(s.split()[0],16)
985 self.exclusions[char] = 1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000986
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000987 widths = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300988 with open_data(EASTASIAN_WIDTH, version) as file:
989 for s in file:
990 s = s.strip()
991 if not s:
992 continue
993 if s[0] == '#':
994 continue
995 s = s.split()[0].split(';')
996 if '..' in s[0]:
997 first, last = [int(c, 16) for c in s[0].split('..')]
998 chars = list(range(first, last+1))
999 else:
1000 chars = [int(s[0], 16)]
1001 for char in chars:
1002 widths[char] = s[1]
1003
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001004 for i in range(0, 0x110000):
1005 if table[i] is not None:
1006 table[i].append(widths[i])
1007
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001008 for i in range(0, 0x110000):
1009 if table[i] is not None:
1010 table[i].append(set())
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001011
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001012 with open_data(DERIVED_CORE_PROPERTIES, version) as file:
1013 for s in file:
1014 s = s.split('#', 1)[0].strip()
1015 if not s:
1016 continue
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001017
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001018 r, p = s.split(";")
1019 r = r.strip()
1020 p = p.strip()
1021 if ".." in r:
1022 first, last = [int(c, 16) for c in r.split('..')]
1023 chars = list(range(first, last+1))
1024 else:
1025 chars = [int(r, 16)]
1026 for char in chars:
1027 if table[char]:
1028 # Some properties (e.g. Default_Ignorable_Code_Point)
1029 # apply to unassigned code points; ignore them
1030 table[char][-1].add(p)
1031
1032 with open_data(LINE_BREAK, version) as file:
1033 for s in file:
1034 s = s.partition('#')[0]
1035 s = [i.strip() for i in s.split(';')]
1036 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1037 continue
1038 if '..' not in s[0]:
1039 first = last = int(s[0], 16)
1040 else:
1041 first, last = [int(c, 16) for c in s[0].split('..')]
1042 for char in range(first, last+1):
1043 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +00001044
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001045 # We only want the quickcheck properties
1046 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1047 # Yes is the default, hence only N and M occur
1048 # In 3.2.0, the format was different (NF?_NO)
1049 # The parsing will incorrectly determine these as
1050 # "yes", however, unicodedata.c will not perform quickchecks
1051 # for older versions, and no delta records will be created.
1052 quickchecks = [0] * 0x110000
1053 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001054 with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
1055 for s in file:
1056 if '#' in s:
1057 s = s[:s.index('#')]
1058 s = [i.strip() for i in s.split(';')]
1059 if len(s) < 2 or s[1] not in qc_order:
1060 continue
1061 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1062 quickcheck_shift = qc_order.index(s[1])*2
1063 quickcheck <<= quickcheck_shift
1064 if '..' not in s[0]:
1065 first = last = int(s[0], 16)
1066 else:
1067 first, last = [int(c, 16) for c in s[0].split('..')]
1068 for char in range(first, last+1):
1069 assert not (quickchecks[char]>>quickcheck_shift)&3
1070 quickchecks[char] |= quickcheck
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001071 for i in range(0, 0x110000):
1072 if table[i] is not None:
1073 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +00001074
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001075 with open_data(UNIHAN, version) as file:
1076 zip = zipfile.ZipFile(file)
1077 if version == '3.2.0':
1078 data = zip.open('Unihan-3.2.0.txt').read()
1079 else:
1080 data = zip.open('Unihan_NumericValues.txt').read()
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001081 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001082 if not line.startswith('U+'):
1083 continue
1084 code, tag, value = line.split(None, 3)[:3]
1085 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1086 'kOtherNumeric'):
1087 continue
1088 value = value.strip().replace(',', '')
1089 i = int(code[2:], 16)
1090 # Patch the numeric field
1091 if table[i] is not None:
1092 table[i][8] = value
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05001093 sc = self.special_casing = {}
1094 with open_data(SPECIAL_CASING, version) as file:
1095 for s in file:
1096 s = s[:-1].split('#', 1)[0]
1097 if not s:
1098 continue
1099 data = s.split("; ")
1100 if data[4]:
1101 # We ignore all conditionals (since they depend on
1102 # languages) except for one, which is hardcoded. See
1103 # handle_capital_sigma in unicodeobject.c.
1104 continue
1105 c = int(data[0], 16)
1106 lower = [int(char, 16) for char in data[1].split()]
1107 title = [int(char, 16) for char in data[2].split()]
1108 upper = [int(char, 16) for char in data[3].split()]
1109 sc[c] = (lower, title, upper)
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001110
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001111 def uselatin1(self):
1112 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +00001113 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001114
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001115# hash table tools
1116
1117# this is a straight-forward reimplementation of Python's built-in
1118# dictionary type, using a static data structure, and a custom string
1119# hash algorithm.
1120
def myhash(s, magic):
    """Hash the string *s* case-insensitively with multiplier *magic*,
    folding any overflow past bit 23 back into the low byte so the
    result always fits in 24 bits."""
    h = 0
    for code in map(ord, s.upper()):
        h = h * magic + code
        overflow = h & 0xff000000
        if overflow:
            # xor bits 24-31 into the low byte, then truncate to 24 bits
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
1129
# Candidate hash-table sizes (powers of two) paired with a small offset;
# Hash.__init__ picks the first size larger than the data set and uses
# size + offset as the polynomial for collision probing.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1136
class Hash:
    """A static open-addressing hash table built once from (key, value)
    pairs, dumpable as a C array plus the #defines (magic, size, poly)
    that the C lookup code needs to reproduce the probing."""

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: the smallest power of two in SIZES that
        # leaves at least one free slot; poly becomes the probing polynomial
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # collision: probe with a second hash-derived increment,
            # folding it through the polynomial when it leaves the table
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots become 0 in the emitted C table
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array followed by the lookup
        # parameters the C side needs
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1200
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001201# stuff to deal with arrays of unsigned integers
1202
class Array:
    """A named sequence of unsigned integers, dumpable as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as a static C declaration.

        The element type is the smallest unsigned C type that holds every
        value; if *trace* is true, the total byte size goes to stderr.
        """
        itemsize = getsize(self.data)
        if trace:
            print(self.name+":", itemsize*len(self.data), "bytes", file=sys.stderr)
        if itemsize == 1:
            ctype = "unsigned char"
        elif itemsize == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static " + ctype + " " + self.name + "[] = {\n")
        if self.data:
            # emit comma-separated values, wrapping lines at 78 columns
            line = "    "
            for value in self.data:
                piece = str(value) + ", "
                if len(line) + len(piece) > 78:
                    file.write(line + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1234
def getsize(data):
    """Return the smallest C unsigned integer size, in bytes (1, 2 or 4),
    that can represent every value in *data*.

    An empty sequence fits in the smallest type, so 1 is returned; the
    previous code let max() raise ValueError, which crashed Array.dump()
    for an empty array (dump calls getsize before its emptiness check).
    """
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
1244
Tim Peters21013482000-09-25 07:13:41 +00001245def splitbins(t, trace=0):
1246 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1247
1248 t is a sequence of ints. This function can be useful to save space if
1249 many of the ints are the same. t1 and t2 are lists of ints, and shift
1250 is an int, chosen to minimize the combined size of t1 and t2 (in C
1251 code), and where for each i in range(len(t)),
1252 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1253 where mask is a bitmask isolating the last "shift" bits.
1254
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001255 If optional arg trace is non-zero (default zero), progress info
1256 is printed to sys.stderr. The higher the value, the more info
1257 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001258 """
1259
Tim Peters21013482000-09-25 07:13:41 +00001260 if trace:
1261 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001262 print("%d+%d bins at shift %d; %d bytes" % (
1263 len(t1), len(t2), shift, bytes), file=sys.stderr)
1264 print("Size of original table:", len(t)*getsize(t), \
1265 "bytes", file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001266 n = len(t)-1 # last valid index
1267 maxshift = 0 # the most we can shift n and still have something left
1268 if n > 0:
1269 while n >> 1:
1270 n >>= 1
1271 maxshift += 1
1272 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001273 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001274 t = tuple(t) # so slices can be dict keys
1275 for shift in range(maxshift + 1):
1276 t1 = []
1277 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001278 size = 2**shift
1279 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001280 for i in range(0, len(t), size):
1281 bin = t[i:i+size]
1282 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001283 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001284 index = len(t2)
1285 bincache[bin] = index
1286 t2.extend(bin)
1287 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001288 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001289 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001290 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001291 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001292 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001293 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001294 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001295 t1, t2, shift = best
1296 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001297 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001298 dump(t1, t2, shift, bytes)
1299 if __debug__:
1300 # exhaustively verify that the decomposition is correct
1301 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001302 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001303 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1304 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001305
if __name__ == "__main__":
    # Regenerate the C database headers.  The argument is presumably a
    # trace flag (maketables is defined earlier in this file) — confirm
    # against its signature before changing.
    maketables(1)