blob: 0aebdcae2c3e89b3cde04176221903686d66da21 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00004# this script converts a unicode 3.2 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000022# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Georg Brandld52429f2008-07-04 15:55:02 +000023# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
Ezio Melotti931b8aa2011-10-21 21:57:36 +030024# 2011-10-21 ezio add support for name aliases and named sequences
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050025# 2012-01 benjamin add full case mappings
Fredrik Lundhcfcea492000-09-25 08:07:06 +000026#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000027# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000028#
29
Ezio Melotti931b8aa2011-10-21 21:57:36 +030030import os
31import sys
32import zipfile
33
34from textwrap import dedent
Fredrik Lundhf367cac2000-09-24 23:18:31 +000035
36SCRIPT = sys.argv[0]
Amaury Forgeot d'Arc324ac652010-08-18 20:44:58 +000037VERSION = "3.2"
Fredrik Lundhf367cac2000-09-24 23:18:31 +000038
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000039# The Unicode Database
R David Murray7445a382014-10-09 17:30:33 -040040# --------------------
41# When changing UCD version please update
42# * Doc/library/stdtypes.rst, and
43# * Doc/library/unicodedata.rst
Benjamin Peterson94d08d92013-10-10 17:24:45 -040044UNIDATA_VERSION = "6.3.0"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000045UNICODE_DATA = "UnicodeData%s.txt"
46COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
47EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
Martin v. Löwisbaecd722010-10-11 22:42:28 +000048UNIHAN = "Unihan%s.zip"
Martin v. Löwis13c3e382007-08-14 22:37:03 +000049DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +000050DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
Florent Xicluna806d8cf2010-03-30 19:34:18 +000051LINE_BREAK = "LineBreak%s.txt"
Ezio Melotti931b8aa2011-10-21 21:57:36 +030052NAME_ALIASES = "NameAliases%s.txt"
53NAMED_SEQUENCES = "NamedSequences%s.txt"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050054SPECIAL_CASING = "SpecialCasing%s.txt"
Benjamin Petersond5890c82012-01-14 13:23:30 -050055CASE_FOLDING = "CaseFolding%s.txt"
Ezio Melotti931b8aa2011-10-21 21:57:36 +030056
57# Private Use Areas -- in planes 1, 15, 16
58PUA_1 = range(0xE000, 0xF900)
59PUA_15 = range(0xF0000, 0xFFFFE)
60PUA_16 = range(0x100000, 0x10FFFE)
61
62# we use these ranges of PUA_15 to store name aliases and named sequences
63NAME_ALIASES_START = 0xF0000
Benjamin Peterson71f660e2012-02-20 22:24:29 -050064NAMED_SEQUENCES_START = 0xF0200
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000065
66old_versions = ["3.2.0"]
Fredrik Lundhf367cac2000-09-24 23:18:31 +000067
68CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
69 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
70 "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
71 "So" ]
72
73BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
74 "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
Benjamin Peterson94d08d92013-10-10 17:24:45 -040075 "ON", "LRI", "RLI", "FSI", "PDI" ]
Fredrik Lundhf367cac2000-09-24 23:18:31 +000076
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +000077EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
78
Florent Xicluna806d8cf2010-03-30 19:34:18 +000079MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
80
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000081# note: should match definitions in Objects/unicodectype.c
Fredrik Lundhe9133f72000-09-25 17:59:57 +000082ALPHA_MASK = 0x01
83DECIMAL_MASK = 0x02
84DIGIT_MASK = 0x04
85LOWER_MASK = 0x08
Fredrik Lundh0f8fad42000-09-25 21:01:56 +000086LINEBREAK_MASK = 0x10
Fredrik Lundhe9133f72000-09-25 17:59:57 +000087SPACE_MASK = 0x20
88TITLE_MASK = 0x40
89UPPER_MASK = 0x80
Martin v. Löwis13c3e382007-08-14 22:37:03 +000090XID_START_MASK = 0x100
91XID_CONTINUE_MASK = 0x200
Georg Brandld52429f2008-07-04 15:55:02 +000092PRINTABLE_MASK = 0x400
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050093NUMERIC_MASK = 0x800
94CASE_IGNORABLE_MASK = 0x1000
95CASED_MASK = 0x2000
96EXTENDED_CASE_MASK = 0x4000
Fredrik Lundhe9133f72000-09-25 17:59:57 +000097
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +000098# these ranges need to match unicodedata.c:is_unified_ideograph
99cjk_ranges = [
100 ('3400', '4DB5'),
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500101 ('4E00', '9FCC'),
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000102 ('20000', '2A6D6'),
103 ('2A700', '2B734'),
104 ('2B740', '2B81D')
105]
106
def maketables(trace=0):
    """Drive the whole regeneration: load the current UCD, merge in the
    deltas for each supported old Unicode version, then write the name,
    database and type tables."""

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    ucd = UnicodeData(UNIDATA_VERSION)

    # count assigned code points (entries with a record)
    print(sum(1 for rec in ucd.table if rec), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_ucd = UnicodeData(version, cjk_check=False)
        print(sum(1 for rec in old_ucd.table if rec), "characters")
        merge_old_version(version, ucd, old_ucd)

    makeunicodename(ucd, trace)
    makeunicodedata(ucd, trace)
    makeunicodetype(ucd, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000125
126# --------------------------------------------------------------------
127# unicode character properties
128
def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h: the per-character database
    records, decomposition tables, NFC composition pair tables, and the
    per-old-version change/normalization delta tables.

    *unicode* is a UnicodeData instance; *trace* is forwarded to
    splitbins()/Array.dump() for diagnostics.
    """

    # dummy record shared by all unassigned code points (index 0)
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}          # record tuple -> index in `table` (dedup)
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            # (record fields follow the UnicodeData.txt column layout)
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]           # flat array: per entry, header word + code points
    decomp_prefix = [""]        # compatibility tags ("<compat>", ...); "" = canonical
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []             # (first, last, composed) NFC composition triples
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # hard limit mirrored by nfd_nfkd() in unicodedata.c
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                # prefix index must fit in the low byte of the header word
                assert prefix < 256
                # content: header word packs prefix (low 8 bits) and
                # length (bits 8+), followed by the decomposition code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical two-char decompositions whose
                # char is not excluded and whose first char is a starter
                # (combining class "0")
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): `decomp` is a list but `decomp_data` holds
                # ints, so .index() can never match and always raises
                # ValueError — every decomposition is appended; confirm
                # whether entry sharing was intended here.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber comp_first/comp_last to dense indexes and collect the
    # contiguous code-point ranges they cover (for the reindex tables).
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                # extend the current contiguous range
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # 2D composition table flattened to 1D: [first_index][last_index] -> char
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        # deduplicate change records the same way as the main table above
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()
377
378# --------------------------------------------------------------------
379# unicode character type tables
380
def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h: the character type records
    (case mappings, digit/decimal/numeric values, flag bits), the
    extended-case array, and C switch functions for numeric value,
    whitespace and linebreak tests.

    *unicode* is a UnicodeData instance; *trace* is forwarded to
    splitbins()/Array.dump().
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)   # record for unassigned code points (index 0)
    table = [dummy]
    cache = {0: dummy}           # record tuple -> index in `table` (dedup)
    index = [0] * len(unicode.chars)
    numeric = {}                 # numeric-value string -> [code points]
    spaces = []                  # code points with SPACE_MASK
    linebreaks = []              # code points with LINEBREAK_MASK
    extra_casing = []            # flat Py_UCS4 array for multi-char casings

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            # NOTE(review): `delta` is assigned but never read in this
            # function — dead store.
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # printable: space itself, or anything outside categories C* / Z*
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            # sc: (lower, title, upper) lists from SpecialCasing.txt, if any;
            # cf: case folding, defaulting to the character itself
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            # simple one-to-one mappings from UnicodeData.txt fields 12-14
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # a case folding that differs from the simple lowercase forces
            # the extended-case representation below
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    # uncased: store zero deltas
                    upper = lower = title = 0
                else:
                    # store signed deltas relative to the character
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                # pack offset into extra_casing (low bits) with the mapping
                # length in bits 24+
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    # case folding differs: its length goes into bits 20-23
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
590
591# --------------------------------------------------------------------
592# unicode name database
593
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h, the compressed tables used for
    code point <-> character name lookups in the unicodedata module.

    unicode -- a UnicodeData instance holding the parsed UCD records
    trace   -- verbosity flag, passed through to the table dumpers
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names; a trailing NUL marks the end of each complete name
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            # names in angle brackets (<control>, <...First>, ...) are
            # placeholders, not real character names
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence, and words ending a sentence.  the
    # latter includes the trailing null byte.

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                # the list length is used as the word's frequency count;
                # the first element is the word's discovery index
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need (two-byte word
    # indexes are introduced by an escape byte)
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon: each name becomes a
    # sequence of word indexes (one byte for common words, two for the rest)
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # the phrasebook must fit in unsigned chars for the C tables to work
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    print(file=fp)
    print('static const unsigned int aliases_start = %#x;' %
          NAME_ALIASES_START, file=fp)
    print('static const unsigned int aliases_end = %#x;' %
          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

    print('static const unsigned int name_aliases[] = {', file=fp)
    for name, codepoint in unicode.aliases:
        print('    0x%04X,' % codepoint, file=fp)
    print('};', file=fp)

    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
    # sequences or sequences with non-BMP chars are added.
    # unicodedata_lookup should be adapted too.
    print(dedent("""
        typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
        } named_sequence;
        """), file=fp)

    print('static const unsigned int named_sequences_start = %#x;' %
          NAMED_SEQUENCES_START, file=fp)
    print('static const unsigned int named_sequences_end = %#x;' %
          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

    print('static const named_sequence named_sequences[] = {', file=fp)
    for name, sequence in unicode.named_sequences:
        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
    print('};', file=fp)

    fp.close()
785
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000786
def merge_old_version(version, new, old):
    """Compute the per-codepoint differences between an older UCD *old*
    and the current UCD *new*, and append them to ``new.changed`` as a
    (version, changes, normalization_changes) delta record.

    Only a fixed set of record fields is supported; an unexpected
    difference raises an ad-hoc ``Difference`` exception.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ; k is the index into the UCD
        # record (see the record-structure comment on UnicodeData)
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi-mirrored flag, stored as '0'/'1' strings
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                       normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000877
def open_data(template, version):
    """Return an open file object for a UCD data file, downloading it
    from unicode.org first when no versioned local copy exists.

    template -- file name pattern with one %s slot (e.g. "UnicodeData%s.txt")
    version  -- Unicode version string; 3.2.0 uses an irregular URL layout
    """
    cached = template % ('-'+version,)
    if not os.path.exists(cached):
        import urllib.request
        # Unicode 3.2 was published before the per-version /ucd/ layout.
        if version == '3.2.0':
            # irregular url structure
            url = 'http://www.unicode.org/Public/3.2-Update/' + cached
        else:
            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        urllib.request.urlretrieve(url, filename=cached)
    # .txt data files are decoded as UTF-8; everything else (Unihan.zip)
    # is handed back as raw bytes.
    if cached.endswith('.txt'):
        return open(cached, encoding='utf-8')
    return open(cached, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000893
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000894# --------------------------------------------------------------------
895# the following support code is taken from the unidb utilities
896# Copyright (c) 1999-2000 by Secret Labs AB
897
898# load a unicode-data file from disk
899
class UnicodeData:
    """Parsed Unicode Character Database for one Unicode version.

    Reads UnicodeData.txt plus the auxiliary UCD files and exposes the
    merged per-codepoint records as ``self.table`` (indexed by code
    point), along with aliases, named sequences, exclusions, special
    casing and case folding maps.
    """
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, version,
                 linebreakprops=False,
                 expand=1,
                 cjk_check=True):
        # version        -- Unicode version string, e.g. '6.1.0'
        # linebreakprops -- unused in this body; kept for caller compatibility
        # expand         -- expand <...First>/<...Last> range records in place
        # cjk_check      -- verify the CJK ranges found against the module's
        #                   expected cjk_ranges list
        self.changed = []
        table = [None] * 0x110000
        # UNICODE_DATA etc. are module-level file name templates -- defined
        # earlier in this file, outside this chunk.
        with open_data(UNICODE_DATA, version) as file:
            while 1:
                s = file.readline()
                if not s:
                    break
                s = s.strip().split(";")
                char = int(s[0], 16)
                table[char] = s

        cjk_ranges_found = []

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        if s[1].startswith("<CJK Ideograph"):
                            cjk_ranges_found.append((field[0],
                                                     s[0]))
                        s[1] = ""
                        field = None
                elif field:
                    # inside a First..Last range: copy the range record
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2
            if cjk_check and cjk_ranges != cjk_ranges_found:
                raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            with open_data(NAME_ALIASES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    char, name, abbrev = s.split(';')
                    char = int(char, 16)
                    self.aliases.append((name, char))
                    # also store the name in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 1, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            with open_data(NAMED_SEQUENCES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    name, chars = s.split(';')
                    chars = tuple(int(char, 16) for char in chars.split())
                    # check that the structure defined in makeunicodename is OK
                    assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                    assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                        "the NamedSequence struct and in unicodedata_lookup")
                    self.named_sequences.append((name, chars))
                    # also store these in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        # composition exclusions: set of code points excluded from NFC
        self.exclusions = {}
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                char = int(s.split()[0],16)
                self.exclusions[char] = 1

        # east asian width: appended as a new field to every record
        widths = [None] * 0x110000
        with open_data(EASTASIAN_WIDTH, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                if '..' in s[0]:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # derived properties: appended as a set in a further new field
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())

        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
            for s in file:
                s = s.split('#', 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                if ".." in r:
                    first, last = [int(c, 16) for c in r.split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(r, 16)]
                for char in chars:
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        # mandatory line breaks are folded into the derived-property set
        with open_data(LINE_BREAK, version) as file:
            for s in file:
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
            for s in file:
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])

        # Unihan numeric values patch the regular numeric field (index 8)
        with open_data(UNIHAN, version) as file:
            # NOTE: 'zip' shadows the builtin for the rest of this scope
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value
        sc = self.special_casing = {}
        with open_data(SPECIAL_CASING, version) as file:
            for s in file:
                s = s[:-1].split('#', 1)[0]
                if not s:
                    continue
                data = s.split("; ")
                if data[4]:
                    # We ignore all conditionals (since they depend on
                    # languages) except for one, which is hardcoded. See
                    # handle_capital_sigma in unicodeobject.c.
                    continue
                c = int(data[0], 16)
                lower = [int(char, 16) for char in data[1].split()]
                title = [int(char, 16) for char in data[2].split()]
                upper = [int(char, 16) for char in data[3].split()]
                sc[c] = (lower, title, upper)
        cf = self.case_folding = {}
        if version != '3.2.0':
            with open_data(CASE_FOLDING, version) as file:
                for s in file:
                    s = s[:-1].split('#', 1)[0]
                    if not s:
                        continue
                    data = s.split("; ")
                    # only Common and Full foldings are used
                    if data[1] in "CF":
                        c = int(data[0], 16)
                        cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001145
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001146# hash table tools
1147
1148# this is a straight-forward reimplementation of Python's built-in
1149# dictionary type, using a static data structure, and a custom string
1150# hash algorithm.
1151
def myhash(s, magic):
    """Case-insensitive multiplicative string hash.

    Multiplies by *magic* per character and folds any overflow beyond
    24 bits back into the low bits, so the result stays below 2**24.
    """
    digest = 0
    for code in (ord(ch) for ch in s.upper()):
        digest = digest * magic + code
        overflow = digest & 0xff000000
        if overflow:
            digest = (digest ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return digest
1160
# Candidate hash-table sizes (powers of two) paired with a small constant;
# Hash.__init__ picks the first size larger than the data set and uses
# size + constant as the probe polynomial for collision resolution.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1167
class Hash:
    """A static open-addressing hash table built from (key, value) pairs,
    dumpable as C source (used for the name->code dictionary)."""

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first power of two larger than the data,
        # with its associated probe polynomial (see SIZES)
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        # NOTE: shadows the 'hash' builtin for the rest of this scope
        hash = myhash

        # initialize hash table; collisions are resolved by a polynomial
        # probe sequence derived from the key's hash
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots are stored as 0 so the C array is fully defined
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the #defines the
        # C lookup code needs to re-run the same probe sequence
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1231
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001232# stuff to deal with arrays of unsigned integers
1233
class Array:
    """A named sequence of unsigned integers that can be written out as
    a static C array declaration."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as C source; when *trace* is true,
        report the table's byte size on stderr."""
        width = getsize(self.data)
        if trace:
            print(self.name+":", width*len(self.data), "bytes", file=sys.stderr)
        # pick the smallest unsigned C type that holds every element
        ctype = {1: "unsigned char", 2: "unsigned short"}.get(width, "unsigned int")
        file.write("static " + ctype + " " + self.name + "[] = {\n")
        if self.data:
            # pack the initializers into lines of at most 78 columns
            line = "    "
            for value in self.data:
                piece = str(value) + ", "
                if len(line) + len(piece) > 78:
                    file.write(line + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1265
def getsize(data):
    """Return the width in bytes (1, 2 or 4) of the smallest unsigned C
    integer type able to hold every value in *data*."""
    largest = max(data)
    if largest < 256:
        return 1
    if largest < 65536:
        return 2
    return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
    def report(index_tab, value_tab, shift, nbytes):
        print("%d+%d bins at shift %d; %d bytes" % (
            len(index_tab), len(value_tab), shift, nbytes), file=sys.stderr)

    if trace:
        print("Size of original table:", len(t)*getsize(t),
              "bytes", file=sys.stderr)

    # the largest shift that still leaves a nonempty first-level table
    maxshift = 0
    n = len(t) - 1
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1

    best = None
    best_bytes = sys.maxsize          # smallest total size so far
    data = tuple(t)                   # tuples can be dict keys
    for shift in range(maxshift + 1):
        binsize = 2 ** shift
        index_tab = []
        value_tab = []
        seen = {}                     # bin contents -> offset in value_tab
        for start in range(0, len(data), binsize):
            chunk = data[start:start + binsize]
            pos = seen.get(chunk)
            if pos is None:
                # first occurrence of this bin: append it
                pos = len(value_tab)
                seen[chunk] = pos
                value_tab.extend(chunk)
            index_tab.append(pos >> shift)
        # memory size of the two-level layout at this shift
        nbytes = (len(index_tab)*getsize(index_tab)
                  + len(value_tab)*getsize(value_tab))
        if trace > 1:
            report(index_tab, value_tab, shift, nbytes)
        # strict < keeps the first (smallest-shift) minimal layout
        if nbytes < best_bytes:
            best = index_tab, value_tab, shift
            best_bytes = nbytes
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        report(t1, t2, shift, best_bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)       # i.e., low-bit mask of shift bits
        for i in range(len(data)):
            assert data[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001336
if __name__ == "__main__":
    # regenerate all database headers; maketables is defined earlier in
    # this file -- the argument 1 is presumably a trace level, TODO confirm
    maketables(1)