blob: b7d31e8806652b58b98e76b665849fc0ee8ef315 [file] [log] [blame]
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001#
Fredrik Lundhe9133f72000-09-25 17:59:57 +00002# (re)generate unicode property and type databases
3#
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00004# this script converts a unicode 3.2 database file to
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00005# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6# and Objects/unicodetype_db.h
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007#
8# history:
9# 2000-09-24 fl created (based on bits and pieces from unidb)
10# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
Fredrik Lundhe9133f72000-09-25 17:59:57 +000011# 2000-09-25 fl added character type table
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000012# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +000013# 2000-11-03 fl expand first/last ranges
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +000014# 2001-01-19 fl added character name tables (2.1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000015# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
Martin v. Löwis677bde22002-11-23 22:08:15 +000016# 2002-09-11 wd use string methods
17# 2002-10-18 mvl update to Unicode 3.2
18# 2002-10-22 mvl generate NFC tables
Martin v. Löwis97225da2002-11-24 23:05:09 +000019# 2002-11-24 mvl expand all ranges, sort names version-independently
Martin v. Löwisb5c980b2002-11-25 09:13:37 +000020# 2002-11-25 mvl add UNIDATA_VERSION
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +000021# 2004-05-29 perky add east asian width information
Martin v. Löwis43179c82006-03-11 12:43:44 +000022# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
Georg Brandld52429f2008-07-04 15:55:02 +000023# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
Ezio Melotti931b8aa2011-10-21 21:57:36 +030024# 2011-10-21 ezio add support for name aliases and named sequences
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050025# 2012-01 benjamin add full case mappings
Fredrik Lundhcfcea492000-09-25 08:07:06 +000026#
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000027# written by Fredrik Lundh (fredrik@pythonware.com)
Fredrik Lundhf367cac2000-09-24 23:18:31 +000028#
29
import os
import sys
import zipfile

from textwrap import dedent

# Name of this generator script and its own version, both stamped into
# the header comment of every generated file.
SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
# --------------------
# When changing UCD version please update
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "7.0.0"

# Input file name templates; the %s is filled with "" for the current
# UCD version or "-<version>" for an old snapshot (see maketables()).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Older UCD releases for which delta ("changed") tables are generated.
old_versions = ["3.2.0"]

# General-category abbreviations; the list position defines the numeric
# code written into the generated tables, so the order must not change.
# NOTE(review): "Cn" appears at index 0 and again at index 17 -- kept
# as-is, since the positions are baked into the generated data.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class abbreviations; position defines the numeric code.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

# East-asian-width abbreviations; position defines the numeric code.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes treated as mandatory breaks.
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FCC'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D')
]
107
def maketables(trace=0):
    """Drive generation of all three Unicode database headers.

    Loads the current UnicodeData files, folds in every release listed
    in ``old_versions`` (so delta tables can be produced), then writes
    the name, property and type tables.  *trace* is passed through to
    the individual table writers.
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    db = UnicodeData(UNIDATA_VERSION)

    # count assigned code points (entries that are not None/empty)
    print(sum(1 for entry in db.table if entry), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + version), "...")
        previous = UnicodeData(version, cjk_check=False)
        print(sum(1 for entry in previous.table if entry), "characters")
        merge_old_version(version, db, previous)

    makeunicodename(db, trace)
    makeunicodedata(db, trace)
    makeunicodetype(db, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000126
127# --------------------------------------------------------------------
128# unicode character properties
129
130def makeunicodedata(unicode, trace):
131
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000132 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000133 table = [dummy]
134 cache = {0: dummy}
135 index = [0] * len(unicode.chars)
136
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000137 FILE = "Modules/unicodedata_db.h"
138
Collin Winter6afaeb72007-08-03 17:06:41 +0000139 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000140
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000141 # 1) database properties
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000142
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000143 for char in unicode.chars:
144 record = unicode.table[char]
145 if record:
146 # extract database properties
147 category = CATEGORY_NAMES.index(record[2])
148 combining = int(record[3])
149 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
150 mirrored = record[9] == "Y"
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000151 eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000152 normalizationquickcheck = record[17]
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000153 item = (
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000154 category, combining, bidirectional, mirrored, eastasianwidth,
155 normalizationquickcheck
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000156 )
157 # add entry to index and item tables
158 i = cache.get(item)
159 if i is None:
160 cache[item] = i = len(table)
161 table.append(item)
162 index[char] = i
163
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000164 # 2) decomposition data
165
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000166 decomp_data = [0]
167 decomp_prefix = [""]
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000168 decomp_index = [0] * len(unicode.chars)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000169 decomp_size = 0
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000170
Martin v. Löwis677bde22002-11-23 22:08:15 +0000171 comp_pairs = []
172 comp_first = [None] * len(unicode.chars)
173 comp_last = [None] * len(unicode.chars)
174
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000175 for char in unicode.chars:
176 record = unicode.table[char]
177 if record:
178 if record[5]:
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000179 decomp = record[5].split()
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000180 if len(decomp) > 19:
Collin Wintera817e582007-08-22 23:05:06 +0000181 raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000182 # prefix
183 if decomp[0][0] == "<":
184 prefix = decomp.pop(0)
185 else:
186 prefix = ""
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000187 try:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000188 i = decomp_prefix.index(prefix)
189 except ValueError:
190 i = len(decomp_prefix)
191 decomp_prefix.append(prefix)
192 prefix = i
193 assert prefix < 256
194 # content
Georg Brandlbf82e372008-05-16 17:02:34 +0000195 decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
Martin v. Löwis677bde22002-11-23 22:08:15 +0000196 # Collect NFC pairs
197 if not prefix and len(decomp) == 3 and \
198 char not in unicode.exclusions and \
199 unicode.table[decomp[1]][3] == "0":
200 p, l, r = decomp
201 comp_first[l] = 1
202 comp_last[r] = 1
203 comp_pairs.append((l,r,char))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000204 try:
205 i = decomp_data.index(decomp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000206 except ValueError:
207 i = len(decomp_data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000208 decomp_data.extend(decomp)
209 decomp_size = decomp_size + len(decomp) * 2
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000210 else:
211 i = 0
212 decomp_index[char] = i
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000213
Martin v. Löwis677bde22002-11-23 22:08:15 +0000214 f = l = 0
215 comp_first_ranges = []
216 comp_last_ranges = []
217 prev_f = prev_l = None
218 for i in unicode.chars:
219 if comp_first[i] is not None:
220 comp_first[i] = f
221 f += 1
222 if prev_f is None:
223 prev_f = (i,i)
224 elif prev_f[1]+1 == i:
225 prev_f = prev_f[0],i
226 else:
227 comp_first_ranges.append(prev_f)
228 prev_f = (i,i)
229 if comp_last[i] is not None:
230 comp_last[i] = l
231 l += 1
232 if prev_l is None:
233 prev_l = (i,i)
234 elif prev_l[1]+1 == i:
235 prev_l = prev_l[0],i
236 else:
237 comp_last_ranges.append(prev_l)
238 prev_l = (i,i)
239 comp_first_ranges.append(prev_f)
240 comp_last_ranges.append(prev_l)
241 total_first = f
242 total_last = l
243
244 comp_data = [0]*(total_first*total_last)
245 for f,l,char in comp_pairs:
246 f = comp_first[f]
247 l = comp_last[l]
248 comp_data[f*total_last+l] = char
249
Collin Winter6afaeb72007-08-03 17:06:41 +0000250 print(len(table), "unique properties")
251 print(len(decomp_prefix), "unique decomposition prefixes")
252 print(len(decomp_data), "unique decomposition entries:", end=' ')
253 print(decomp_size, "bytes")
254 print(total_first, "first characters in NFC")
255 print(total_last, "last characters in NFC")
256 print(len(comp_pairs), "NFC pairs")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000257
Collin Winter6afaeb72007-08-03 17:06:41 +0000258 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000259
Fred Drake9c685052000-10-26 03:56:46 +0000260 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000261 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
262 print(file=fp)
263 print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
264 print("/* a list of unique database records */", file=fp)
265 print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000266 for item in table:
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000267 print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
Collin Winter6afaeb72007-08-03 17:06:41 +0000268 print("};", file=fp)
269 print(file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000270
Collin Winter6afaeb72007-08-03 17:06:41 +0000271 print("/* Reindexing of NFC first characters. */", file=fp)
272 print("#define TOTAL_FIRST",total_first, file=fp)
273 print("#define TOTAL_LAST",total_last, file=fp)
274 print("struct reindex{int start;short count,index;};", file=fp)
Martin v. Löwis59683e82008-06-13 07:50:45 +0000275 print("static struct reindex nfc_first[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000276 for start,end in comp_first_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000277 print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
278 print(" {0,0,0}", file=fp)
279 print("};\n", file=fp)
Martin v. Löwis59683e82008-06-13 07:50:45 +0000280 print("static struct reindex nfc_last[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000281 for start,end in comp_last_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000282 print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
283 print(" {0,0,0}", file=fp)
284 print("};\n", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000285
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000286 # FIXME: <fl> the following tables could be made static, and
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000287 # the support code moved into unicodedatabase.c
288
Collin Winter6afaeb72007-08-03 17:06:41 +0000289 print("/* string literals */", file=fp)
290 print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000291 for name in CATEGORY_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000292 print(" \"%s\"," % name, file=fp)
293 print(" NULL", file=fp)
294 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000295
Collin Winter6afaeb72007-08-03 17:06:41 +0000296 print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000297 for name in BIDIRECTIONAL_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000298 print(" \"%s\"," % name, file=fp)
299 print(" NULL", file=fp)
300 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000301
Collin Winter6afaeb72007-08-03 17:06:41 +0000302 print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000303 for name in EASTASIANWIDTH_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000304 print(" \"%s\"," % name, file=fp)
305 print(" NULL", file=fp)
306 print("};", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000307
Collin Winter6afaeb72007-08-03 17:06:41 +0000308 print("static const char *decomp_prefix[] = {", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000309 for name in decomp_prefix:
Collin Winter6afaeb72007-08-03 17:06:41 +0000310 print(" \"%s\"," % name, file=fp)
311 print(" NULL", file=fp)
312 print("};", file=fp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000313
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000314 # split record index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000315 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000316
Collin Winter6afaeb72007-08-03 17:06:41 +0000317 print("/* index tables for the database records */", file=fp)
318 print("#define SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000319 Array("index1", index1).dump(fp, trace)
320 Array("index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000321
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000322 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000323 index1, index2, shift = splitbins(decomp_index, trace)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000324
Collin Winter6afaeb72007-08-03 17:06:41 +0000325 print("/* decomposition data */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000326 Array("decomp_data", decomp_data).dump(fp, trace)
327
Collin Winter6afaeb72007-08-03 17:06:41 +0000328 print("/* index tables for the decomposition data */", file=fp)
329 print("#define DECOMP_SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000330 Array("decomp_index1", index1).dump(fp, trace)
331 Array("decomp_index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000332
Martin v. Löwis677bde22002-11-23 22:08:15 +0000333 index, index2, shift = splitbins(comp_data, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000334 print("/* NFC pairs */", file=fp)
335 print("#define COMP_SHIFT", shift, file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000336 Array("comp_index", index).dump(fp, trace)
337 Array("comp_data", index2).dump(fp, trace)
338
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000339 # Generate delta tables for old versions
340 for version, table, normalization in unicode.changed:
341 cversion = version.replace(".","_")
342 records = [table[0]]
343 cache = {table[0]:0}
344 index = [0] * len(table)
345 for i, record in enumerate(table):
346 try:
347 index[i] = cache[record]
348 except KeyError:
349 index[i] = cache[record] = len(records)
350 records.append(record)
351 index1, index2, shift = splitbins(index, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000352 print("static const change_record change_records_%s[] = {" % cversion, file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000353 for record in records:
Collin Winter6afaeb72007-08-03 17:06:41 +0000354 print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
355 print("};", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000356 Array("changes_%s_index" % cversion, index1).dump(fp, trace)
357 Array("changes_%s_data" % cversion, index2).dump(fp, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000358 print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
359 print("{", file=fp)
360 print("\tint index;", file=fp)
361 print("\tif (n >= 0x110000) index = 0;", file=fp)
362 print("\telse {", file=fp)
363 print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
364 print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
365 (cversion, shift, ((1<<shift)-1)), file=fp)
366 print("\t}", file=fp)
367 print("\treturn change_records_%s+index;" % cversion, file=fp)
368 print("}\n", file=fp)
369 print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
370 print("{", file=fp)
371 print("\tswitch(n) {", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000372 for k, v in normalization:
Collin Winter6afaeb72007-08-03 17:06:41 +0000373 print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
374 print("\tdefault: return 0;", file=fp)
375 print("\t}\n}\n", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000376
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000377 fp.close()
378
# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h from *unicode* (a UnicodeData
    instance, defined elsewhere in this script).

    Emits: the deduplicated type records (case mappings, digit values,
    flag bits), the extended-case array for one-to-many case mappings,
    the two-level type index, and C switch functions for the numeric
    value, whitespace and line-break predicates.  *trace* is forwarded
    to splitbins() and Array.dump().

    Fix vs. previous revision: removed the local ``delta = True``,
    which was assigned but never read.
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    # A record is (upper, lower, title, decimal, digit, flags); slot 0
    # is the all-zero record used for unassigned code points.
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}        # numeric value (string) -> [code points]
    spaces = []         # code points flagged as whitespace
    linebreaks = []     # code points flagged as line breaks
    extra_casing = []   # flat array backing one-to-many case mappings

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # printable = everything except control/separator categories,
            # with U+0020 SPACE special-cased back in
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            # simple one-to-one case mappings (hex fields 12-14); a
            # missing field means the character maps to itself (title
            # falls back to the uppercase mapping)
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # a casefold differing from the lowercase form also forces
            # the extended-case representation
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                # simple case: store signed deltas relative to the
                # character (0,0,0 when it maps to itself)
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    # deltas must fit in a signed 32-bit C field
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                # Packing: low bits = start index into extra_casing,
                # bits 24+ = mapping length; for `lower`, bits 20-23
                # additionally hold the casefold length when it differs.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables; identical records are
            # shared between code points via the cache
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals (fractions like "1/3" become
        # a C division of two float literals)
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
591
592# --------------------------------------------------------------------
593# unicode name database
594
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h (the character name database).

    Emits three structures:
      * a compressed word "lexicon" (every distinct word used in names),
      * a per-character "phrasebook" of word indexes (code->name), and
      * a static hash table mapping names back to code points.
    *unicode* is a UnicodeData instance; *trace* is forwarded to the
    Array/Hash dump routines.
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names: one NUL-terminated name per assigned character;
    # placeholder labels such as "<control>" are skipped
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

    words = {}
    n = b = 0  # n: total word count, b: total name bytes (statistics only)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            # each words[] entry is a list whose length is the word's
            # frequency; its first element is the insertion index
            # (NOTE: the inner loop variable deliberately shadows w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need; an escape byte
    # selects the high byte of a two-byte word index (see below)
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        # a word's index is the position its offset gets in lexicon_offset
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon: for each named
    # character, a run of word indexes -- one byte if < short, else an
    # escaped two-byte pair
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # the phrasebook must fit in an unsigned char array
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table into a two-level index
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    print(file=fp)
    print('static const unsigned int aliases_start = %#x;' %
          NAME_ALIASES_START, file=fp)
    print('static const unsigned int aliases_end = %#x;' %
          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

    print('static const unsigned int name_aliases[] = {', file=fp)
    for name, codepoint in unicode.aliases:
        print('    0x%04X,' % codepoint, file=fp)
    print('};', file=fp)

    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
    # sequences or sequences with non-BMP chars are added.
    # unicodedata_lookup should be adapted too.
    print(dedent("""
        typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
        } named_sequence;
        """), file=fp)

    print('static const unsigned int named_sequences_start = %#x;' %
          NAMED_SEQUENCES_START, file=fp)
    print('static const unsigned int named_sequences_end = %#x;' %
          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

    print('static const named_sequence named_sequences[] = {', file=fp)
    for name, sequence in unicode.named_sequences:
        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
    print('};', file=fp)

    fp.close()
786
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000787
def merge_old_version(version, new, old):
    """Append to new.changed a delta record describing how the older UCD
    snapshot *old* differs from the current snapshot *new*.

    The record is a triple: (version, zipped per-character change lists,
    normalization_changes).  In the change lists, 0xFF encodes "no
    change" (for the numeric field: 0 is "no change", -1 is "had no
    numeric value").  Field indexes *k* follow the record structure
    documented on UnicodeData.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ, field by field
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        # general category
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        # bidirectional class
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # decomposition mapping
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # decimal digit value
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # numeric value
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi-mirrored flag, recorded as '0'/'1' characters
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        # unexpected field change: fail loudly with context
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000878
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000879def open_data(template, version):
880 local = template % ('-'+version,)
881 if not os.path.exists(local):
882 import urllib.request
883 if version == '3.2.0':
884 # irregular url structure
885 url = 'http://www.unicode.org/Public/3.2-Update/' + local
886 else:
887 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
888 urllib.request.urlretrieve(url, filename=local)
889 if local.endswith('.txt'):
890 return open(local, encoding='utf-8')
891 else:
892 # Unihan.zip
893 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000894
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000895# --------------------------------------------------------------------
896# the following support code is taken from the unidb utilities
897# Copyright (c) 1999-2000 by Secret Labs AB
898
899# load a unicode-data file from disk
900
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000901class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000902 # Record structure:
903 # [ID, name, category, combining, bidi, decomp, (6)
904 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
905 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
906 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000907
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000908 def __init__(self, version,
909 linebreakprops=False,
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000910 expand=1,
911 cjk_check=True):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000912 self.changed = []
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000913 table = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300914 with open_data(UNICODE_DATA, version) as file:
915 while 1:
916 s = file.readline()
917 if not s:
918 break
919 s = s.strip().split(";")
920 char = int(s[0], 16)
921 table[char] = s
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000922
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000923 cjk_ranges_found = []
924
Martin v. Löwis97225da2002-11-24 23:05:09 +0000925 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000926 if expand:
927 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000928 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000929 s = table[i]
930 if s:
931 if s[1][-6:] == "First>":
932 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000933 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000934 elif s[1][-5:] == "Last>":
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000935 if s[1].startswith("<CJK Ideograph"):
936 cjk_ranges_found.append((field[0],
937 s[0]))
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000938 s[1] = ""
939 field = None
940 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000941 f2 = field[:]
942 f2[0] = "%X" % i
943 table[i] = f2
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000944 if cjk_check and cjk_ranges != cjk_ranges_found:
945 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000946
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000947 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000948 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000949 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000950 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000951
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300952 # check for name aliases and named sequences, see #12753
953 # aliases and named sequences are not in 3.2.0
954 if version != '3.2.0':
955 self.aliases = []
956 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
957 # in order to take advantage of the compression and lookup
958 # algorithms used for the other characters
959 pua_index = NAME_ALIASES_START
960 with open_data(NAME_ALIASES, version) as file:
961 for s in file:
962 s = s.strip()
963 if not s or s.startswith('#'):
964 continue
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500965 char, name, abbrev = s.split(';')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300966 char = int(char, 16)
967 self.aliases.append((name, char))
968 # also store the name in the PUA 1
969 self.table[pua_index][1] = name
970 pua_index += 1
971 assert pua_index - NAME_ALIASES_START == len(self.aliases)
972
973 self.named_sequences = []
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300974 # store named sequences in the PUA 1, in range U+F0100..,
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300975 # in order to take advantage of the compression and lookup
976 # algorithms used for the other characters.
977
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500978 assert pua_index < NAMED_SEQUENCES_START
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300979 pua_index = NAMED_SEQUENCES_START
980 with open_data(NAMED_SEQUENCES, version) as file:
981 for s in file:
982 s = s.strip()
983 if not s or s.startswith('#'):
984 continue
985 name, chars = s.split(';')
986 chars = tuple(int(char, 16) for char in chars.split())
987 # check that the structure defined in makeunicodename is OK
988 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
989 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
990 "the NamedSequence struct and in unicodedata_lookup")
991 self.named_sequences.append((name, chars))
992 # also store these in the PUA 1
993 self.table[pua_index][1] = name
994 pua_index += 1
995 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
996
Martin v. Löwis677bde22002-11-23 22:08:15 +0000997 self.exclusions = {}
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300998 with open_data(COMPOSITION_EXCLUSIONS, version) as file:
999 for s in file:
1000 s = s.strip()
1001 if not s:
1002 continue
1003 if s[0] == '#':
1004 continue
1005 char = int(s.split()[0],16)
1006 self.exclusions[char] = 1
Martin v. Löwis677bde22002-11-23 22:08:15 +00001007
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001008 widths = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001009 with open_data(EASTASIAN_WIDTH, version) as file:
1010 for s in file:
1011 s = s.strip()
1012 if not s:
1013 continue
1014 if s[0] == '#':
1015 continue
1016 s = s.split()[0].split(';')
1017 if '..' in s[0]:
1018 first, last = [int(c, 16) for c in s[0].split('..')]
1019 chars = list(range(first, last+1))
1020 else:
1021 chars = [int(s[0], 16)]
1022 for char in chars:
1023 widths[char] = s[1]
1024
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001025 for i in range(0, 0x110000):
1026 if table[i] is not None:
1027 table[i].append(widths[i])
1028
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001029 for i in range(0, 0x110000):
1030 if table[i] is not None:
1031 table[i].append(set())
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001032
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001033 with open_data(DERIVED_CORE_PROPERTIES, version) as file:
1034 for s in file:
1035 s = s.split('#', 1)[0].strip()
1036 if not s:
1037 continue
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001038
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001039 r, p = s.split(";")
1040 r = r.strip()
1041 p = p.strip()
1042 if ".." in r:
1043 first, last = [int(c, 16) for c in r.split('..')]
1044 chars = list(range(first, last+1))
1045 else:
1046 chars = [int(r, 16)]
1047 for char in chars:
1048 if table[char]:
1049 # Some properties (e.g. Default_Ignorable_Code_Point)
1050 # apply to unassigned code points; ignore them
1051 table[char][-1].add(p)
1052
1053 with open_data(LINE_BREAK, version) as file:
1054 for s in file:
1055 s = s.partition('#')[0]
1056 s = [i.strip() for i in s.split(';')]
1057 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1058 continue
1059 if '..' not in s[0]:
1060 first = last = int(s[0], 16)
1061 else:
1062 first, last = [int(c, 16) for c in s[0].split('..')]
1063 for char in range(first, last+1):
1064 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +00001065
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001066 # We only want the quickcheck properties
1067 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1068 # Yes is the default, hence only N and M occur
1069 # In 3.2.0, the format was different (NF?_NO)
1070 # The parsing will incorrectly determine these as
1071 # "yes", however, unicodedata.c will not perform quickchecks
1072 # for older versions, and no delta records will be created.
1073 quickchecks = [0] * 0x110000
1074 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001075 with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
1076 for s in file:
1077 if '#' in s:
1078 s = s[:s.index('#')]
1079 s = [i.strip() for i in s.split(';')]
1080 if len(s) < 2 or s[1] not in qc_order:
1081 continue
1082 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1083 quickcheck_shift = qc_order.index(s[1])*2
1084 quickcheck <<= quickcheck_shift
1085 if '..' not in s[0]:
1086 first = last = int(s[0], 16)
1087 else:
1088 first, last = [int(c, 16) for c in s[0].split('..')]
1089 for char in range(first, last+1):
1090 assert not (quickchecks[char]>>quickcheck_shift)&3
1091 quickchecks[char] |= quickcheck
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001092 for i in range(0, 0x110000):
1093 if table[i] is not None:
1094 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +00001095
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001096 with open_data(UNIHAN, version) as file:
1097 zip = zipfile.ZipFile(file)
1098 if version == '3.2.0':
1099 data = zip.open('Unihan-3.2.0.txt').read()
1100 else:
1101 data = zip.open('Unihan_NumericValues.txt').read()
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001102 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001103 if not line.startswith('U+'):
1104 continue
1105 code, tag, value = line.split(None, 3)[:3]
1106 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1107 'kOtherNumeric'):
1108 continue
1109 value = value.strip().replace(',', '')
1110 i = int(code[2:], 16)
1111 # Patch the numeric field
1112 if table[i] is not None:
1113 table[i][8] = value
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05001114 sc = self.special_casing = {}
1115 with open_data(SPECIAL_CASING, version) as file:
1116 for s in file:
1117 s = s[:-1].split('#', 1)[0]
1118 if not s:
1119 continue
1120 data = s.split("; ")
1121 if data[4]:
1122 # We ignore all conditionals (since they depend on
1123 # languages) except for one, which is hardcoded. See
1124 # handle_capital_sigma in unicodeobject.c.
1125 continue
1126 c = int(data[0], 16)
1127 lower = [int(char, 16) for char in data[1].split()]
1128 title = [int(char, 16) for char in data[2].split()]
1129 upper = [int(char, 16) for char in data[3].split()]
1130 sc[c] = (lower, title, upper)
Benjamin Petersond5890c82012-01-14 13:23:30 -05001131 cf = self.case_folding = {}
1132 if version != '3.2.0':
1133 with open_data(CASE_FOLDING, version) as file:
1134 for s in file:
1135 s = s[:-1].split('#', 1)[0]
1136 if not s:
1137 continue
1138 data = s.split("; ")
1139 if data[1] in "CF":
1140 c = int(data[0], 16)
1141 cf[c] = [int(char, 16) for char in data[2].split()]
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001142
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001143 def uselatin1(self):
1144 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +00001145 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001146
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001147# hash table tools
1148
1149# this is a straight-forward reimplementation of Python's built-in
1150# dictionary type, using a static data structure, and a custom string
1151# hash algorithm.
1152
def myhash(s, magic):
    """Case-insensitive string hash folded into 24 bits.

    Used by Hash below to place names into the static name->code table;
    the result always fits in the low 24 bits.
    """
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        overflow = h & 0xff000000
        if overflow:
            # fold bits 24..31 back into the low 24 bits
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
1161
# Candidate hash-table configurations as (size, poly) pairs: size is a
# power-of-two slot count, and poly parametrizes the probe-increment
# rehashing used by Hash.__init__ below when a collision occurs.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1168
class Hash:
    """Static open-addressing hash table built from (key, value) pairs.

    This is a straight reimplementation of a dictionary as a static
    structure; dump() writes the table plus the magic/size/poly
    constants so the C lookup code can replay the same probe sequence.
    """
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: the first power of two from SIZES with
        # more slots than items; poly drives collision rehashing
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0  # collision counter (statistics only)

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: derive a probe increment from the hash and
            # step through the table until a free slot is found
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # unused slots read as 0 in the generated C table
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the #defines
        # needed to reproduce the probe sequence at lookup time
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1232
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001233# stuff to deal with arrays of unsigned integers
1234
class Array:
    """A named sequence of unsigned integers that can be written out as
    a static C array declaration."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # Emit "static <type> <name>[] = {...};" using the narrowest
        # unsigned C integer type that can hold every element.
        width = getsize(self.data)
        if trace:
            print(self.name+":", width*len(self.data), "bytes", file=sys.stderr)
        if width == 1:
            ctype = "unsigned char"
        elif width == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            # wrap the initializer at roughly 78 columns
            line = "    "
            for value in self.data:
                item = str(value) + ", "
                if len(line) + len(item) > 78:
                    file.write(line + "\n")
                    line = "    " + item
                else:
                    line = line + item
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1266
def getsize(data):
    """Return the smallest C integer size, in bytes (1, 2 or 4), able to
    hold every value in *data*.

    An empty sequence yields 1 (a zero-length "unsigned char" array);
    previously this raised ValueError from max() on an empty sequence,
    even though Array.dump calls getsize() before checking for empty
    data.
    """
    if not data:
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
1276
Tim Peters21013482000-09-25 07:13:41 +00001277def splitbins(t, trace=0):
1278 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1279
1280 t is a sequence of ints. This function can be useful to save space if
1281 many of the ints are the same. t1 and t2 are lists of ints, and shift
1282 is an int, chosen to minimize the combined size of t1 and t2 (in C
1283 code), and where for each i in range(len(t)),
1284 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1285 where mask is a bitmask isolating the last "shift" bits.
1286
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001287 If optional arg trace is non-zero (default zero), progress info
1288 is printed to sys.stderr. The higher the value, the more info
1289 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001290 """
1291
Tim Peters21013482000-09-25 07:13:41 +00001292 if trace:
1293 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001294 print("%d+%d bins at shift %d; %d bytes" % (
1295 len(t1), len(t2), shift, bytes), file=sys.stderr)
1296 print("Size of original table:", len(t)*getsize(t), \
1297 "bytes", file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001298 n = len(t)-1 # last valid index
1299 maxshift = 0 # the most we can shift n and still have something left
1300 if n > 0:
1301 while n >> 1:
1302 n >>= 1
1303 maxshift += 1
1304 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001305 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001306 t = tuple(t) # so slices can be dict keys
1307 for shift in range(maxshift + 1):
1308 t1 = []
1309 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001310 size = 2**shift
1311 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001312 for i in range(0, len(t), size):
1313 bin = t[i:i+size]
1314 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001315 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001316 index = len(t2)
1317 bincache[bin] = index
1318 t2.extend(bin)
1319 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001320 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001321 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001322 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001323 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001324 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001325 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001326 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001327 t1, t2, shift = best
1328 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001329 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001330 dump(t1, t2, shift, bytes)
1331 if __debug__:
1332 # exhaustively verify that the decomposition is correct
1333 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001334 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001335 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1336 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001337
if __name__ == "__main__":
    # Regenerate the database headers.  The argument is presumably the
    # trace level forwarded to the dump helpers -- see maketables,
    # defined earlier in this file.
    maketables(1)