blob: d83cf634954d2a439fa7754a1eb07664ebd01e09 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import os
import sys
import zipfile

from textwrap import dedent

# Script identity, embedded in the header comment of each generated file
# ("/* this file was generated by <SCRIPT> <VERSION> */").
SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
# Version of the Unicode Character Database the tables are generated from.
UNIDATA_VERSION = "6.1.0"
# UCD data-file name templates.  The "%s" slot is filled with "" for the
# current UNIDATA_VERSION, or with "-<version>" for an older snapshot
# (see maketables()).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Older UCD versions for which delta ("change") tables are generated.
old_versions = ["3.2.0"]

# General-category codes; a character's category is stored as an index
# into this list (the generated _PyUnicode_CategoryNames mirrors it).
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class codes, likewise stored by index.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# East Asian width classes, likewise stored by index.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes that force a mandatory line break.
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# Bit flags packed into the flags field of _PyUnicode_TypeRecord
# (assembled in makeunicodetype()).
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# (first, last) code points -- as hex strings -- of the CJK unified
# ideograph blocks.
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FCC'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D')
]
102
def maketables(trace=0):
    """Top-level driver: parse the UCD, merge in the old-version deltas,
    and regenerate the three database headers (names, properties, types).

    trace is forwarded to the individual generators (and ultimately to
    splitbins()) to enable extra diagnostic output.
    """

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    ucd = UnicodeData(UNIDATA_VERSION)

    # count the assigned (non-empty) entries in the code point table
    print(sum(1 for entry in ucd.table if entry), "characters")

    # fold each supported historical UCD snapshot into the current data,
    # so delta tables can be generated for it later
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + version), "...")
        previous = UnicodeData(version, cjk_check=False)
        print(sum(1 for entry in previous.table if entry), "characters")
        merge_old_version(version, ucd, previous)

    makeunicodename(ucd, trace)
    makeunicodedata(ucd, trace)
    makeunicodetype(ucd, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000121
122# --------------------------------------------------------------------
123# unicode character properties
124
def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h from the parsed UCD.

    Emits, in order:
      * _PyUnicode_Database_Records -- deduplicated (category, combining,
        bidirectional, mirrored, east-asian-width, quickcheck) records,
        plus two-level index1/index2 tables mapping code points to them
      * NFC composition tables (nfc_first/nfc_last reindexes + comp_data)
      * the flat decomposition array and its index tables
      * per-old-version change_records / get_change_* / normalization_*
        delta tables (one set per entry in unicode.changed)

    unicode is a UnicodeData instance (see maketables()); trace is passed
    through to splitbins() for statistics output.
    """

    # record 0 is a shared dummy used for all unassigned code points
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties (record fields follow the
            # UnicodeData.txt column layout, extended by the loader)
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]          # flat array: [header, char, char, ...] runs
    decomp_prefix = [""]       # compatibility tags, e.g. "<compat>"; "" = canonical
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix: an optional leading "<tag>" marks a
                # compatibility (non-canonical) decomposition
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: header word packs the prefix index (low byte)
                # and the decomposition length (high byte)
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions, not composition-excluded, whose first
                # character has combining class 0
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints, so
                # .index(decomp) -- searching for a *list* -- always raises
                # ValueError; the dedupe branch never fires and each
                # decomposition is appended at its own offset.  Output is
                # still correct, just unshared -- TODO confirm intended.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # renumber NFC first/last characters densely (0..total-1) and record
    # the contiguous code point ranges they occupy
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # dense total_first x total_last matrix of composed characters
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table into a two-level (index1/index2) lookup
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    # (note: this loop deliberately reuses the names table/cache/index
    # for the per-version data)
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()
373
# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h from the parsed UCD.

    Emits, in order:
      * _PyUnicode_TypeRecords -- deduplicated (upper, lower, title,
        decimal, digit, flags) type records, plus index1/index2 tables
      * _PyUnicode_ExtendedCase -- overflow array for multi-character
        case mappings and case folds
      * generated C functions _PyUnicode_ToNumeric,
        _PyUnicode_IsWhitespace and _PyUnicode_IsLinebreak

    unicode is a UnicodeData instance (see maketables()); trace is passed
    through to splitbins() for statistics output.
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types; record 0 is a shared dummy for unassigned
    # code points
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}        # numeric value (string) -> [code points]
    spaces = []
    linebreaks = []
    extra_casing = []   # overflow storage for multi-char case mappings

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            # NOTE(review): `delta` is never read after this assignment.
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # printable: everything except other control/format/surrogate
            # (C*) and separator (Z*) categories -- but space itself is
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            # simple one-to-one case mappings from UnicodeData.txt
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # a case fold differing from the lowercase forces the
            # extended-case representation below
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                # simple case: store signed deltas from the code point
                # (0, 0, 0 when the character does not case-map at all)
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                # Each field packs: offset into extra_casing (low bits),
                # mapping length << 24, and for `lower` optionally the
                # fold length << 20.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals (fractions like "1/4" become
        # quotient expressions of two float literals)
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
586
# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):
    """Write Modules/unicodename_db.h, the character-name database.

    Three related structures are emitted as C arrays:
      * a compressed "lexicon" of every word occurring in character names,
      * a per-codepoint "phrasebook" of lexicon indexes (code -> name),
      * a static hash table mapping names back to codepoints (name -> code),
    plus the name-alias and named-sequence tables.

    unicode: UnicodeData instance (uses .chars, .table, .aliases and
             .named_sequences).
    trace:   verbosity flag forwarded to Array.dump and splitbins.
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            # skip pseudo-names in angle brackets (<control>,
            # <...First>, <...Last>): they are markers, not real names
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we differentiate
    # between words inside a sentence and words ending a sentence;
    # the latter include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    # seen before: just record one more occurrence
                    l.append(None)
                else:
                    # first occurrence
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    # frequent word: fits in a single byte
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # the phrasebook must fit in unsigned chars
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table into a two-level index
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    print(file=fp)
    print('static const unsigned int aliases_start = %#x;' %
          NAME_ALIASES_START, file=fp)
    print('static const unsigned int aliases_end = %#x;' %
          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

    print('static const unsigned int name_aliases[] = {', file=fp)
    for name, codepoint in unicode.aliases:
        print('    0x%04X,' % codepoint, file=fp)
    print('};', file=fp)

    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
    # sequences or sequences with non-BMP chars are added.
    # unicodedata_lookup should be adapted too.
    print(dedent("""
        typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
        } named_sequence;
        """), file=fp)

    print('static const unsigned int named_sequences_start = %#x;' %
          NAMED_SEQUENCES_START, file=fp)
    print('static const unsigned int named_sequences_end = %#x;' %
          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

    print('static const named_sequence named_sequences[] = {', file=fp)
    for name, sequence in unicode.named_sequences:
        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
    print('};', file=fp)

    fp.close()
781
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000782
783def merge_old_version(version, new, old):
784 # Changes to exclusion file not implemented yet
785 if old.exclusions != new.exclusions:
Collin Wintera817e582007-08-22 23:05:06 +0000786 raise NotImplementedError("exclusions differ")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000787
788 # In these change records, 0xFF means "no change"
789 bidir_changes = [0xFF]*0x110000
790 category_changes = [0xFF]*0x110000
791 decimal_changes = [0xFF]*0x110000
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000792 mirrored_changes = [0xFF]*0x110000
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000793 # In numeric data, 0 means "no change",
794 # -1 means "did not have a numeric value
795 numeric_changes = [0] * 0x110000
796 # normalization_changes is a list of key-value pairs
797 normalization_changes = []
798 for i in range(0x110000):
799 if new.table[i] is None:
800 # Characters unassigned in the new version ought to
801 # be unassigned in the old one
802 assert old.table[i] is None
803 continue
804 # check characters unassigned in the old version
805 if old.table[i] is None:
806 # category 0 is "unassigned"
807 category_changes[i] = 0
808 continue
809 # check characters that differ
810 if old.table[i] != new.table[i]:
811 for k in range(len(old.table[i])):
812 if old.table[i][k] != new.table[i][k]:
813 value = old.table[i][k]
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300814 if k == 1 and i in PUA_15:
815 # the name is not set in the old.table, but in the
816 # new.table we are using it for aliases and named seq
817 assert value == ''
818 elif k == 2:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000819 #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
820 category_changes[i] = CATEGORY_NAMES.index(value)
821 elif k == 4:
822 #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
823 bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
824 elif k == 5:
825 #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
826 # We assume that all normalization changes are in 1:1 mappings
827 assert " " not in value
828 normalization_changes.append((i, value))
829 elif k == 6:
830 #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
831 # we only support changes where the old value is a single digit
832 assert value in "0123456789"
833 decimal_changes[i] = int(value)
834 elif k == 8:
835 # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
836 # Since 0 encodes "no change", the old value is better not 0
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000837 if not value:
838 numeric_changes[i] = -1
839 else:
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000840 numeric_changes[i] = float(value)
841 assert numeric_changes[i] not in (0, -1)
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000842 elif k == 9:
843 if value == 'Y':
844 mirrored_changes[i] = '1'
845 else:
846 mirrored_changes[i] = '0'
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000847 elif k == 11:
848 # change to ISO comment, ignore
849 pass
850 elif k == 12:
851 # change to simple uppercase mapping; ignore
852 pass
853 elif k == 13:
854 # change to simple lowercase mapping; ignore
855 pass
856 elif k == 14:
857 # change to simple titlecase mapping; ignore
858 pass
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000859 elif k == 16:
860 # derived property changes; not yet
861 pass
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000862 elif k == 17:
863 # normalization quickchecks are not performed
864 # for older versions
865 pass
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000866 else:
867 class Difference(Exception):pass
Collin Wintera817e582007-08-22 23:05:06 +0000868 raise Difference(hex(i), k, old.table[i], new.table[i])
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000869 new.changed.append((version, list(zip(bidir_changes, category_changes,
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000870 decimal_changes, mirrored_changes,
871 numeric_changes)),
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000872 normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000873
def open_data(template, version):
    """Open (downloading and caching first, if necessary) a UCD data file.

    template is a filename pattern with a single '%s' slot; version is a
    Unicode version string.  The file is cached in the working directory
    (e.g. 'UnicodeData-3.2.0.txt').  Text files are returned opened in
    UTF-8 text mode, anything else (the Unihan zip archive) in binary.
    """
    cached = template % ('-' + version,)
    if not os.path.exists(cached):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = 'http://www.unicode.org/Public/3.2-Update/' + cached
        else:
            url = ('http://www.unicode.org/Public/%s/ucd/' + template) % (version, '')
        urllib.request.urlretrieve(url, filename=cached)
    if cached.endswith('.txt'):
        return open(cached, encoding='utf-8')
    # Unihan.zip
    return open(cached, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000889
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000890# --------------------------------------------------------------------
891# the following support code is taken from the unidb utilities
892# Copyright (c) 1999-2000 by Secret Labs AB
893
894# load a unicode-data file from disk
895
class UnicodeData:
    """In-memory representation of the Unicode Character Database.

    Loads UnicodeData.txt plus the auxiliary UCD files (via open_data)
    for the given version and builds self.table, a list indexed by
    codepoint (0..0x10FFFF).  Each entry is the list of semicolon-split
    record fields, extended with the extra columns listed below; None
    marks an unassigned codepoint.
    """
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, version,
                 linebreakprops=False,
                 expand=1,
                 cjk_check=True):
        # NOTE(review): linebreakprops is not referenced in this method --
        # presumably kept for caller compatibility; confirm before removing.
        # delta records are appended later by merge_old_version()
        self.changed = []
        table = [None] * 0x110000
        with open_data(UNICODE_DATA, version) as file:
            while 1:
                s = file.readline()
                if not s:
                    break
                s = s.strip().split(";")
                char = int(s[0], 16)
                table[char] = s

        cjk_ranges_found = []

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        if s[1].startswith("<CJK Ideograph"):
                            cjk_ranges_found.append((field[0],
                                                     s[0]))
                        s[1] = ""
                        field = None
                elif field:
                    # codepoint inside a First>..Last> range: clone the
                    # range record with the actual codepoint filled in
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            with open_data(NAME_ALIASES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    char, name, abbrev = s.split(';')
                    char = int(char, 16)
                    self.aliases.append((name, char))
                    # also store the name in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 1, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            # aliases and sequences must not collide in the PUA
            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            with open_data(NAMED_SEQUENCES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    name, chars = s.split(';')
                    chars = tuple(int(char, 16) for char in chars.split())
                    # check that the structure defined in makeunicodename is OK
                    assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                    assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                        "the NamedSequence struct and in unicodedata_lookup")
                    self.named_sequences.append((name, chars))
                    # also store these in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        # composition exclusions: codepoint -> 1
        self.exclusions = {}
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                char = int(s.split()[0],16)
                self.exclusions[char] = 1

        # east asian width: appended as an extra record column below
        widths = [None] * 0x110000
        with open_data(EASTASIAN_WIDTH, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                if '..' in s[0]:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # extra column: set of derived property names for each codepoint
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())

        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
            for s in file:
                s = s.split('#', 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                if ".." in r:
                    first, last = [int(c, 16) for c in r.split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(r, 16)]
                for char in chars:
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        # mandatory line breaks: recorded as a pseudo derived property
        with open_data(LINE_BREAK, version) as file:
            for s in file:
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
            for s in file:
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    # each property's 2-bit slot must be set at most once
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])

        # Unihan numeric values patch the record's numeric field (index 8)
        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)    # NOTE: shadows the zip() builtin
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

        # full case mappings: codepoint -> ([lower], [title], [upper])
        sc = self.special_casing = {}
        with open_data(SPECIAL_CASING, version) as file:
            for s in file:
                s = s[:-1].split('#', 1)[0]
                if not s:
                    continue
                data = s.split("; ")
                if data[4]:
                    # We ignore all conditionals (since they depend on
                    # languages) except for one, which is hardcoded. See
                    # handle_capital_sigma in unicodeobject.c.
                    continue
                c = int(data[0], 16)
                lower = [int(char, 16) for char in data[1].split()]
                title = [int(char, 16) for char in data[2].split()]
                upper = [int(char, 16) for char in data[3].split()]
                sc[c] = (lower, title, upper)

        # case folding (common + full statuses): codepoint -> [codepoints]
        cf = self.case_folding = {}
        if version != '3.2.0':
            with open_data(CASE_FOLDING, version) as file:
                for s in file:
                    s = s[:-1].split('#', 1)[0]
                    if not s:
                        continue
                    data = s.split("; ")
                    if data[1] in "CF":
                        c = int(data[0], 16)
                        cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001141
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001142# hash table tools
1143
1144# this is a straight-forward reimplementation of Python's built-in
1145# dictionary type, using a static data structure, and a custom string
1146# hash algorithm.
1147
def myhash(s, magic):
    """Case-insensitive multiplicative string hash.

    Folds each character of the upper-cased string into a 24-bit
    accumulator; whenever the value overflows 24 bits, the high byte is
    xor-ed back in and masked off.  The C lookup code reproduces this
    exact sequence, so the algorithm must not change.
    """
    acc = 0
    for ch in s.upper():
        acc = acc * magic + ord(ch)
        overflow = acc & 0xff000000
        if overflow:
            acc = (acc ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return acc
1156
# Candidate hash-table configurations as (size, poly) pairs: size is a
# power-of-two table size, and poly is added to size by Hash.__init__ to
# form the polynomial used for collision-probe stepping.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1163
class Hash:
    """Static open-addressing hash table, dumped as C arrays.

    Built from a (key, value) list using myhash() with multiplier
    `magic`; collisions are resolved by a probe increment that is
    doubled each step and reduced by a size-specific polynomial (see
    SIZES).  The parameters (magic, size, poly) are emitted as #defines
    so the C lookup code can reproduce the same probe sequence.
    """
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: smallest configured size strictly larger
        # than the number of entries
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0  # collision counter

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: derive a probe increment from the hash
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                # step the increment through the polynomial
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots become 0 so the table packs into an integer array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the #defines the
        # C lookup code needs to replay the probing
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1227
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001228# stuff to deal with arrays of unsigned integers
1229
class Array:
    """A named sequence of unsigned integers, writable as a C array."""

    def __init__(self, name, data):
        self.name = name    # C identifier used in the generated source
        self.data = data    # sequence of non-negative ints

    def dump(self, file, trace=0):
        """Write the data to *file* as a static C array definition.

        The element type (unsigned char/short/int) is the smallest that
        fits every value; output lines wrap at 78 columns.  When trace
        is non-zero, the array's byte size is printed to stderr.
        """
        width = getsize(self.data)
        if trace:
            print(self.name + ":", width * len(self.data), "bytes",
                  file=sys.stderr)
        ctype = {1: "unsigned char", 2: "unsigned short"}.get(
            width, "unsigned int")
        file.write("static " + ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = "    "
            for value in self.data:
                piece = str(value) + ", "
                if len(line) + len(piece) > 78:
                    # line full: flush it and start the next one
                    file.write(line + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1261
def getsize(data):
    """Return the smallest C integer width in bytes (1, 2 or 4) that can
    represent every value of *data*, a non-empty sequence of ints."""
    largest = max(data)
    if largest < 256:
        return 1
    if largest < 65536:
        return 2
    return 4
1271
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t),
              "bytes", file=sys.stderr)

    # Largest shift that still leaves a non-empty index table: one less
    # than the bit length of the last valid index.
    last = len(t) - 1
    maxshift = last.bit_length() - 1 if last > 0 else 0

    best_cost = sys.maxsize          # smallest total size seen so far
    t = tuple(t)                     # so slices can be dict keys
    for shift in range(maxshift + 1):
        binsize = 2**shift
        index = []                   # candidate t1
        values = []                  # candidate t2
        cache = {}                   # bin contents -> offset in values
        for start in range(0, len(t), binsize):
            chunk = t[start:start + binsize]
            pos = cache.get(chunk)
            if pos is None:
                # first time we see this bin: append it to the value table
                pos = len(values)
                cache[chunk] = pos
                values.extend(chunk)
            index.append(pos >> shift)
        # memory footprint of this decomposition, in bytes of C data
        cost = len(index)*getsize(index) + len(values)*getsize(values)
        if trace > 1:
            dump(index, values, shift, cost)
        if cost < best_cost:
            best = index, values, shift
            best_cost = cost
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, best_cost)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)      # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001332
if __name__ == "__main__":
    # Regenerate all database headers.  The argument is presumably the
    # trace/verbosity flag forwarded by maketables() (defined earlier in
    # this file, outside this view) -- confirm against its signature.
    maketables(1)