blob: 17edc3cf45b269020fb95587e82891d3dd2159f2 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00004# this script converts a unicode 3.2 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000022# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Georg Brandld52429f2008-07-04 15:55:02 +000023# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
Ezio Melotti931b8aa2011-10-21 21:57:36 +030024# 2011-10-21 ezio add support for name aliases and named sequences
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050025# 2012-01 benjamin add full case mappings
Fredrik Lundhcfcea492000-09-25 08:07:06 +000026#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000027# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000028#
29
Ezio Melotti931b8aa2011-10-21 21:57:36 +030030import os
31import sys
32import zipfile
33
34from textwrap import dedent
35from operator import itemgetter
Fredrik Lundhf367cac2000-09-24 23:18:31 +000036
SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
# Version of the Unicode Character Database the generated tables are built from.
UNIDATA_VERSION = "6.0.0"
# Input file name templates; "%s" is filled with "" for the current version
# or "-<version>" for an old version (see maketables/UnicodeData).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0100
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000062
# Older UCD versions for which delta (change) tables are generated.
old_versions = ["3.2.0"]

# General-category abbreviations; index into this list is the value stored
# in the generated database records.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class abbreviations; index 0 is the empty/unassigned class.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East Asian Width abbreviations (Fullwidth, Halfwidth, Wide, Narrow,
# Ambiguous, Neutral).
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes that force a mandatory break.
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FCB'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D')
]
103
def maketables(trace=0):
    """Top-level driver.

    Parses the current Unicode database, folds in the per-character deltas
    for every entry in ``old_versions``, then writes the three generated C
    headers (name, data and type tables).  *trace* is forwarded to the
    table writers for splitbins diagnostics.
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNIDATA_VERSION)
    print(len([record for record in unicode.table if record]), "characters")

    # Merge in the change records for each supported old UCD version.
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len([record for record in old_unicode.table if record]), "characters")
        merge_old_version(version, unicode, old_unicode)

    # Emit the three generated headers.
    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000122
123# --------------------------------------------------------------------
124# unicode character properties
125
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h.

    Emits: the deduplicated database-record table, the NFC first/last
    reindex tables and composition pairs, the decomposition data and its
    split index tables, the string-literal name arrays, and per-version
    change_record / normalization delta functions for each entry in
    ``unicode.changed``.  *trace* is passed through to splitbins/Array.
    """

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties
    # Each unique (category, combining, bidi, mirrored, east-asian width,
    # NFC quick-check) tuple is stored once; index[] maps char -> record.

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data
    # decomp_data is a flat int array: a header word (prefix index in the
    # low byte, decomposition length in the high bits) followed by the
    # decomposed code points.  decomp_index maps char -> offset (0 = none).

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # the C decoder (nfd_nfkd) has a fixed-size buffer
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix: a "<compat>"-style tag, if any
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: header word followed by the code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical two-character decompositions
                # not excluded and whose first char is a starter (ccc == 0)
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data holds ints, so .index(decomp)
                # (a list) can never match and this always appends; dedup
                # looks intended but never fires -- verify before changing.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber the chars participating in NFC pairs densely and record the
    # contiguous ranges of first/last characters for the C reindex tables.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # Dense 2-D composition table: (first, last) -> composed char.
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions: deduplicate the per-char
    # change records, then emit a lookup function and a normalization
    # override function for each old version.
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()
374
375# --------------------------------------------------------------------
376# unicode character type tables
377
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h.

    Emits: the deduplicated _PyUnicode_TypeRecords table (case offsets,
    decimal/digit values, type flags), the _PyUnicode_ExtendedCase array
    for multi-character case mappings, split index tables, and the
    generated C functions _PyUnicode_ToNumeric, _PyUnicode_IsWhitespace
    and _PyUnicode_IsLinebreak.  *trace* is passed to splitbins/Array.
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}            # numeric value string -> [code points]
    spaces = []             # code points with the space flag
    linebreaks = []         # code points with the linebreak flag
    extra_casing = []       # flat Py_UCS4 array for extended case mappings

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            # NOTE(review): 'delta' is assigned but never read below --
            # looks like leftover from an earlier delta-encoding scheme.
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # printable: everything except control/format/surrogate/
            # private-use/unassigned and separators -- except space itself
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            # simple one-to-one mappings from UnicodeData.txt fields
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # a case folding differing from lowercase forces the
            # extended-case representation below
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                # simple case: store signed deltas from the char itself
                # (0, 0, 0 when the char does not case-map at all)
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                # encode offset into extra_casing plus length in high bits
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals (fractions stay as a/b expressions)
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
587
588# --------------------------------------------------------------------
589# unicode name database
590
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h (code point <-> name tables).

    Names are compressed in two steps: each name is split into words
    stored as phrasebook references into a shared lexicon string, and
    a static hash table maps names back to code points.

    unicode is a UnicodeData instance; a non-zero trace prints size
    statistics for the generated arrays to stderr.
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names; range markers such as <...First> are skipped.
    # each kept name gets a trailing NUL so the decompressor can tell
    # where a name ends
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(sum(n is not None for n in names), "distinct names")

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence; the
    # latter includes the trailing null byte)

    words = {}   # word -> list with one entry per occurrence
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            parts = name.split()
            b = b + len(name)
            n = n + len(parts)
            for word in parts:
                occurrences = words.get(word)
                if occurrences:
                    occurrences.append(None)
                else:
                    words[word] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}   # reused: now maps word -> 1-based lexicon_offset index

    # build a lexicon string
    offset = 0
    for w, _ in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon: frequent words fit
    # in one byte, rarer words take an escape byte plus a second byte
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            phrasebook_offset[char] = len(phrasebook)
            for word in name.split():
                i = words[word]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry must fit in an unsigned char
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    # use a context manager so the output file is closed even if a
    # dump step raises
    with open(FILE, "w") as fp:
        print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
        print(file=fp)
        print("#define NAME_MAXLEN", 256, file=fp)
        print(file=fp)
        print("/* lexicon */", file=fp)
        Array("lexicon", lexicon).dump(fp, trace)
        Array("lexicon_offset", lexicon_offset).dump(fp, trace)

        # split the phrasebook offset table for compact two-level lookup
        offset1, offset2, shift = splitbins(phrasebook_offset, trace)

        print("/* code->name phrasebook */", file=fp)
        print("#define phrasebook_shift", shift, file=fp)
        print("#define phrasebook_short", short, file=fp)

        Array("phrasebook", phrasebook).dump(fp, trace)
        Array("phrasebook_offset1", offset1).dump(fp, trace)
        Array("phrasebook_offset2", offset2).dump(fp, trace)

        print("/* name->code dictionary */", file=fp)
        codehash.dump(fp, trace)

        print(file=fp)
        print('static const unsigned int aliases_start = %#x;' %
              NAME_ALIASES_START, file=fp)
        print('static const unsigned int aliases_end = %#x;' %
              (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

        print('static const unsigned int name_aliases[] = {', file=fp)
        for name, codepoint in unicode.aliases:
            print(' 0x%04X,' % codepoint, file=fp)
        print('};', file=fp)

        # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
        # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
        # sequences or sequences with non-BMP chars are added.
        # unicodedata_lookup should be adapted too.
        print(dedent("""
            typedef struct NamedSequence {
                int seqlen;
                Py_UCS2 seq[4];
            } named_sequence;
            """), file=fp)

        print('static const unsigned int named_sequences_start = %#x;' %
              NAMED_SEQUENCES_START, file=fp)
        print('static const unsigned int named_sequences_end = %#x;' %
              (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

        print('static const named_sequence named_sequences[] = {', file=fp)
        for name, sequence in unicode.named_sequences:
            seq_str = ', '.join('0x%04X' % cp for cp in sequence)
            print(' {%d, {%s}},' % (len(sequence), seq_str), file=fp)
        print('};', file=fp)
782
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000783
def merge_old_version(version, new, old):
    """Record how the old UCD snapshot differs from the new one.

    Compares old.table and new.table code point by code point and
    appends (version, changes, normalization_changes) to new.changed,
    where changes is a per-codepoint list of
    (bidir, category, decimal, mirrored, numeric) delta records.

    Raises NotImplementedError if the composition exclusions differ,
    and an ad-hoc Difference exception for any field change this
    function does not know how to encode.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            # k indexes the record fields; see the "Record structure"
            # comment on the UnicodeData class for the field layout
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    # value is the OLD field value: the delta tables
                    # describe how to get back to the old version
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi-mirrored flag, stored as '1'/'0' strings
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                       normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000874
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000875def open_data(template, version):
876 local = template % ('-'+version,)
877 if not os.path.exists(local):
878 import urllib.request
879 if version == '3.2.0':
880 # irregular url structure
881 url = 'http://www.unicode.org/Public/3.2-Update/' + local
882 else:
883 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
884 urllib.request.urlretrieve(url, filename=local)
885 if local.endswith('.txt'):
886 return open(local, encoding='utf-8')
887 else:
888 # Unihan.zip
889 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000890
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000891# --------------------------------------------------------------------
892# the following support code is taken from the unidb utilities
893# Copyright (c) 1999-2000 by Secret Labs AB
894
895# load a unicode-data file from disk
896
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000897class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000898 # Record structure:
899 # [ID, name, category, combining, bidi, decomp, (6)
900 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
901 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
902 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000903
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000904 def __init__(self, version,
905 linebreakprops=False,
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000906 expand=1,
907 cjk_check=True):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000908 self.changed = []
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000909 table = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300910 with open_data(UNICODE_DATA, version) as file:
911 while 1:
912 s = file.readline()
913 if not s:
914 break
915 s = s.strip().split(";")
916 char = int(s[0], 16)
917 table[char] = s
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000918
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000919 cjk_ranges_found = []
920
Martin v. Löwis97225da2002-11-24 23:05:09 +0000921 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000922 if expand:
923 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000924 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000925 s = table[i]
926 if s:
927 if s[1][-6:] == "First>":
928 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000929 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000930 elif s[1][-5:] == "Last>":
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000931 if s[1].startswith("<CJK Ideograph"):
932 cjk_ranges_found.append((field[0],
933 s[0]))
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000934 s[1] = ""
935 field = None
936 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000937 f2 = field[:]
938 f2[0] = "%X" % i
939 table[i] = f2
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000940 if cjk_check and cjk_ranges != cjk_ranges_found:
941 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000942
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000943 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000944 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000945 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000946 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000947
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300948 # check for name aliases and named sequences, see #12753
949 # aliases and named sequences are not in 3.2.0
950 if version != '3.2.0':
951 self.aliases = []
952 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
953 # in order to take advantage of the compression and lookup
954 # algorithms used for the other characters
955 pua_index = NAME_ALIASES_START
956 with open_data(NAME_ALIASES, version) as file:
957 for s in file:
958 s = s.strip()
959 if not s or s.startswith('#'):
960 continue
961 char, name = s.split(';')
962 char = int(char, 16)
963 self.aliases.append((name, char))
964 # also store the name in the PUA 1
965 self.table[pua_index][1] = name
966 pua_index += 1
967 assert pua_index - NAME_ALIASES_START == len(self.aliases)
968
969 self.named_sequences = []
970 # store named seqences in the PUA 1, in range U+F0100..,
971 # in order to take advantage of the compression and lookup
972 # algorithms used for the other characters.
973
974 pua_index = NAMED_SEQUENCES_START
975 with open_data(NAMED_SEQUENCES, version) as file:
976 for s in file:
977 s = s.strip()
978 if not s or s.startswith('#'):
979 continue
980 name, chars = s.split(';')
981 chars = tuple(int(char, 16) for char in chars.split())
982 # check that the structure defined in makeunicodename is OK
983 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
984 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
985 "the NamedSequence struct and in unicodedata_lookup")
986 self.named_sequences.append((name, chars))
987 # also store these in the PUA 1
988 self.table[pua_index][1] = name
989 pua_index += 1
990 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
991
Martin v. Löwis677bde22002-11-23 22:08:15 +0000992 self.exclusions = {}
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300993 with open_data(COMPOSITION_EXCLUSIONS, version) as file:
994 for s in file:
995 s = s.strip()
996 if not s:
997 continue
998 if s[0] == '#':
999 continue
1000 char = int(s.split()[0],16)
1001 self.exclusions[char] = 1
Martin v. Löwis677bde22002-11-23 22:08:15 +00001002
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001003 widths = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001004 with open_data(EASTASIAN_WIDTH, version) as file:
1005 for s in file:
1006 s = s.strip()
1007 if not s:
1008 continue
1009 if s[0] == '#':
1010 continue
1011 s = s.split()[0].split(';')
1012 if '..' in s[0]:
1013 first, last = [int(c, 16) for c in s[0].split('..')]
1014 chars = list(range(first, last+1))
1015 else:
1016 chars = [int(s[0], 16)]
1017 for char in chars:
1018 widths[char] = s[1]
1019
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001020 for i in range(0, 0x110000):
1021 if table[i] is not None:
1022 table[i].append(widths[i])
1023
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001024 for i in range(0, 0x110000):
1025 if table[i] is not None:
1026 table[i].append(set())
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001027
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001028 with open_data(DERIVED_CORE_PROPERTIES, version) as file:
1029 for s in file:
1030 s = s.split('#', 1)[0].strip()
1031 if not s:
1032 continue
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001033
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001034 r, p = s.split(";")
1035 r = r.strip()
1036 p = p.strip()
1037 if ".." in r:
1038 first, last = [int(c, 16) for c in r.split('..')]
1039 chars = list(range(first, last+1))
1040 else:
1041 chars = [int(r, 16)]
1042 for char in chars:
1043 if table[char]:
1044 # Some properties (e.g. Default_Ignorable_Code_Point)
1045 # apply to unassigned code points; ignore them
1046 table[char][-1].add(p)
1047
1048 with open_data(LINE_BREAK, version) as file:
1049 for s in file:
1050 s = s.partition('#')[0]
1051 s = [i.strip() for i in s.split(';')]
1052 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1053 continue
1054 if '..' not in s[0]:
1055 first = last = int(s[0], 16)
1056 else:
1057 first, last = [int(c, 16) for c in s[0].split('..')]
1058 for char in range(first, last+1):
1059 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +00001060
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001061 # We only want the quickcheck properties
1062 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1063 # Yes is the default, hence only N and M occur
1064 # In 3.2.0, the format was different (NF?_NO)
1065 # The parsing will incorrectly determine these as
1066 # "yes", however, unicodedata.c will not perform quickchecks
1067 # for older versions, and no delta records will be created.
1068 quickchecks = [0] * 0x110000
1069 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001070 with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
1071 for s in file:
1072 if '#' in s:
1073 s = s[:s.index('#')]
1074 s = [i.strip() for i in s.split(';')]
1075 if len(s) < 2 or s[1] not in qc_order:
1076 continue
1077 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1078 quickcheck_shift = qc_order.index(s[1])*2
1079 quickcheck <<= quickcheck_shift
1080 if '..' not in s[0]:
1081 first = last = int(s[0], 16)
1082 else:
1083 first, last = [int(c, 16) for c in s[0].split('..')]
1084 for char in range(first, last+1):
1085 assert not (quickchecks[char]>>quickcheck_shift)&3
1086 quickchecks[char] |= quickcheck
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001087 for i in range(0, 0x110000):
1088 if table[i] is not None:
1089 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +00001090
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001091 with open_data(UNIHAN, version) as file:
1092 zip = zipfile.ZipFile(file)
1093 if version == '3.2.0':
1094 data = zip.open('Unihan-3.2.0.txt').read()
1095 else:
1096 data = zip.open('Unihan_NumericValues.txt').read()
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001097 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001098 if not line.startswith('U+'):
1099 continue
1100 code, tag, value = line.split(None, 3)[:3]
1101 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1102 'kOtherNumeric'):
1103 continue
1104 value = value.strip().replace(',', '')
1105 i = int(code[2:], 16)
1106 # Patch the numeric field
1107 if table[i] is not None:
1108 table[i][8] = value
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05001109 sc = self.special_casing = {}
1110 with open_data(SPECIAL_CASING, version) as file:
1111 for s in file:
1112 s = s[:-1].split('#', 1)[0]
1113 if not s:
1114 continue
1115 data = s.split("; ")
1116 if data[4]:
1117 # We ignore all conditionals (since they depend on
1118 # languages) except for one, which is hardcoded. See
1119 # handle_capital_sigma in unicodeobject.c.
1120 continue
1121 c = int(data[0], 16)
1122 lower = [int(char, 16) for char in data[1].split()]
1123 title = [int(char, 16) for char in data[2].split()]
1124 upper = [int(char, 16) for char in data[3].split()]
1125 sc[c] = (lower, title, upper)
Benjamin Petersond5890c82012-01-14 13:23:30 -05001126 cf = self.case_folding = {}
1127 if version != '3.2.0':
1128 with open_data(CASE_FOLDING, version) as file:
1129 for s in file:
1130 s = s[:-1].split('#', 1)[0]
1131 if not s:
1132 continue
1133 data = s.split("; ")
1134 if data[1] in "CF":
1135 c = int(data[0], 16)
1136 cf[c] = [int(char, 16) for char in data[2].split()]
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001137
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001138 def uselatin1(self):
1139 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +00001140 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001141
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001142# hash table tools
1143
1144# this is a straight-forward reimplementation of Python's built-in
1145# dictionary type, using a static data structure, and a custom string
1146# hash algorithm.
1147
def myhash(s, magic):
    """Case-insensitive string hash for the static name hash table.

    Folds each character of s.upper() into a multiplicative hash with
    the given magic multiplier, mixing any overflow past 24 bits back
    into the low bits so the result stays below 2**24 + 2**24*magic.
    """
    h = 0
    for code in map(ord, s.upper()):
        h = h * magic + code
        overflow = h & 0xff000000
        if overflow:
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
1156
# Candidate hash-table configurations as (size, poly) pairs, where size
# is a power of two.  Hash.__init__ picks the first entry whose size
# exceeds the number of keys and uses size+poly as the value that
# perturbs the collision-resolution increment (presumably chosen so the
# probe sequence visits every slot -- confirm before changing).
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1163
class Hash:
    """Static open-addressing hash table, dumpable as C arrays.

    Builds the table eagerly in __init__ from (key, value) pairs using
    myhash and the SIZES table; dump() writes the slot array plus the
    #define constants the C lookup code needs (magic, size, poly).
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first power of two larger than the key
        # count; poly becomes size+poly (see SIZES)
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        # n counts collisions (reported and stored below)
        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: probe with an increment derived from the hash,
            # doubling (mod poly) after each miss
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots are encoded as 0 in the C array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1227
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001228# stuff to deal with arrays of unsigned integers
1229
class Array:
    """A named sequence of unsigned integers, writable as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as a static C declaration.

        The element type is the narrowest unsigned C type that holds
        every value; a non-zero trace reports the byte size to stderr.
        """
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            # accumulate items into lines of at most 78 columns
            line = " "
            for item in self.data:
                text = str(item) + ", "
                if len(line) + len(text) > 78:
                    file.write(line + "\n")
                    line = " " + text
                else:
                    line = line + text
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1261
def getsize(data):
    """Return the smallest C integer size (1, 2 or 4 bytes) that can
    represent every value in the non-empty array *data*."""
    biggest = max(data)
    if biggest >= 65536:
        return 4
    if biggest >= 256:
        return 2
    return 1
1271
Tim Peters21013482000-09-25 07:13:41 +00001272def splitbins(t, trace=0):
1273 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1274
1275 t is a sequence of ints. This function can be useful to save space if
1276 many of the ints are the same. t1 and t2 are lists of ints, and shift
1277 is an int, chosen to minimize the combined size of t1 and t2 (in C
1278 code), and where for each i in range(len(t)),
1279 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1280 where mask is a bitmask isolating the last "shift" bits.
1281
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001282 If optional arg trace is non-zero (default zero), progress info
1283 is printed to sys.stderr. The higher the value, the more info
1284 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001285 """
1286
Tim Peters21013482000-09-25 07:13:41 +00001287 if trace:
1288 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001289 print("%d+%d bins at shift %d; %d bytes" % (
1290 len(t1), len(t2), shift, bytes), file=sys.stderr)
1291 print("Size of original table:", len(t)*getsize(t), \
1292 "bytes", file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001293 n = len(t)-1 # last valid index
1294 maxshift = 0 # the most we can shift n and still have something left
1295 if n > 0:
1296 while n >> 1:
1297 n >>= 1
1298 maxshift += 1
1299 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001300 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001301 t = tuple(t) # so slices can be dict keys
1302 for shift in range(maxshift + 1):
1303 t1 = []
1304 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001305 size = 2**shift
1306 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001307 for i in range(0, len(t), size):
1308 bin = t[i:i+size]
1309 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001310 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001311 index = len(t2)
1312 bincache[bin] = index
1313 t2.extend(bin)
1314 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001315 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001316 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001317 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001318 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001319 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001320 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001321 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001322 t1, t2, shift = best
1323 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001324 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001325 dump(t1, t2, shift, bytes)
1326 if __debug__:
1327 # exhaustively verify that the decomposition is correct
1328 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001329 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001330 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1331 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001332
if __name__ == "__main__":
    # regenerate the database headers; maketables is defined earlier in
    # this file (the argument presumably enables trace output -- confirm
    # against maketables' signature)
    maketables(1)