blob: 713e175c501aa84c76f4213a49ef7d044761ae72 [file] [log] [blame]
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
29
Ezio Melotti931b8aa2011-10-21 21:57:36 +030030import os
31import sys
32import zipfile
33
34from textwrap import dedent
Fredrik Lundhf367cac2000-09-24 23:18:31 +000035
SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst, and
#   * Doc/library/unicodedata.rst
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "8.0.0"

# Input file name templates; the %s slot is filled with "" for the
# current version or "-<version>" for an older UCD release (see
# old_versions below).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

old_versions = ["3.2.0"]

# General-category codes, in the order the C side indexes them.
# NOTE(review): "Cn" appears twice (index 0 is the default slot for
# unassigned code points) -- presumably intentional; confirm against
# Modules/unicodedata_db.h consumers before changing.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# Bit flags stored in the type record for each character.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# CJK unified-ideograph blocks, as (first, last) hex code point strings.
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FD5'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
]
108
def maketables(trace=0):
    """Regenerate all three Unicode database headers.

    Loads the current UCD (UNIDATA_VERSION), merges in every release
    listed in old_versions so the delta tables can be built, then
    writes the name, data and type header files in turn.
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNIDATA_VERSION)

    # count the assigned (non-None) entries in the code point table
    print(sum(1 for entry in unicode.table if entry), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(sum(1 for entry in old_unicode.table if entry), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000127
128# --------------------------------------------------------------------
129# unicode character properties
130
def makeunicodedata(unicode, trace):
    """Write Modules/unicodedata_db.h.

    Emits: the deduplicated database records (category, combining
    class, bidi class, mirrored flag, east-asian width, NFC quick
    check), the decomposition data with its two-level index, the NFC
    first/last reindex tables plus the composition pair table, and --
    for each entry in unicode.changed -- the delta tables that let the
    C code emulate older UCD versions.
    """

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables (records are deduplicated
            # through the cache; index maps code point -> record number)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                # the C code stores the decomposition length in 5 bits
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix: an optional compatibility tag such as "<compat>"
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs prefix index (low byte) and
                # length (high bits); the rest are the decomposed code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical two-character decompositions
                # that are not excluded and whose first char has combining
                # class 0 are recomposable
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data is a flat list of ints while
                # decomp is a list, so .index() appears to always raise
                # ValueError (i.e. no sharing ever happens) -- confirm
                # whether deduplication was ever intended here.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # renumber the NFC first/last participants contiguously and record
    # the runs of consecutive code points as (start, end) ranges
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # dense 2-D composition table, flattened row-major:
    # comp_data[first_index * total_last + last_index] = composed char
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        # emit a C accessor that resolves a code point to its change record
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        # emit the per-version normalization override switch
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()
379
380# --------------------------------------------------------------------
381# unicode character type tables
382
def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h.

    Builds the deduplicated character type records (upper/lower/title
    deltas or extended-case offsets, decimal and digit values, flag
    bits), the extended case mapping array, the two-level type index,
    and the generated C helpers _PyUnicode_ToNumeric(),
    _PyUnicode_IsWhitespace() and _PyUnicode_IsLinebreak().
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}            # numeric value (string) -> [code points]
    spaces = []
    linebreaks = []
    extra_casing = []       # flat Py_UCS4 array for multi-char case maps

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            # NOTE(review): `delta` is assigned but never used below.
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # space is the only printable character in categories C/Z
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            # simple one-to-one mappings from UnicodeData.txt fields
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # a case-folding that differs from the lowercase mapping
            # forces the extended-case representation
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    # store signed deltas relative to the code point
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                # pack offset into extra_casing (low bits) with length (<<24)
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    # case-fold length goes into bits 20-23
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
592
593# --------------------------------------------------------------------
594# unicode name database
595
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h from *unicode* (a UnicodeData
    instance).

    The header contains: a compressed word lexicon, a per-codepoint
    "phrasebook" of lexicon indexes for code->name lookups, a static
    name->code hash table, and the name-alias / named-sequence tables.
    *trace* is passed through to the table writers for size reporting.
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names; entries whose name starts with "<" are range
    # markers or controls, not real character names, and are skipped
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need: words beyond the
    # one-byte range are encoded as two bytes via an escape prefix
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon: one-byte index for
    # the `short` most frequent words, two bytes (escape + low byte)
    # for the rest
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    print(file=fp)
    print('static const unsigned int aliases_start = %#x;' %
          NAME_ALIASES_START, file=fp)
    print('static const unsigned int aliases_end = %#x;' %
          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

    print('static const unsigned int name_aliases[] = {', file=fp)
    for name, codepoint in unicode.aliases:
        print('    0x%04X,' % codepoint, file=fp)
    print('};', file=fp)

    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
    # sequences or sequences with non-BMP chars are added.
    # unicodedata_lookup should be adapted too.
    print(dedent("""
        typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
        } named_sequence;
        """), file=fp)

    print('static const unsigned int named_sequences_start = %#x;' %
          NAMED_SEQUENCES_START, file=fp)
    print('static const unsigned int named_sequences_end = %#x;' %
          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

    print('static const named_sequence named_sequences[] = {', file=fp)
    for name, sequence in unicode.named_sequences:
        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
    print('};', file=fp)

    fp.close()
787
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000788
def merge_old_version(version, new, old):
    """Record the differences between *old* and *new* UnicodeData
    instances as a delta appended to new.changed, keyed by the *version*
    string of the old database.

    The field indexes (k) below follow the record layout documented on
    the UnicodeData class.  Unsupported differences raise an exception.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi-mirrored flag, stored as '1'/'0' strings
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          numeric_changes)),
                       normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000879
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000880def open_data(template, version):
881 local = template % ('-'+version,)
882 if not os.path.exists(local):
883 import urllib.request
884 if version == '3.2.0':
885 # irregular url structure
886 url = 'http://www.unicode.org/Public/3.2-Update/' + local
887 else:
888 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
889 urllib.request.urlretrieve(url, filename=local)
890 if local.endswith('.txt'):
891 return open(local, encoding='utf-8')
892 else:
893 # Unihan.zip
894 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000895
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000896# --------------------------------------------------------------------
897# the following support code is taken from the unidb utilities
898# Copyright (c) 1999-2000 by Secret Labs AB
899
900# load a unicode-data file from disk
901
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000902class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000903 # Record structure:
904 # [ID, name, category, combining, bidi, decomp, (6)
905 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
906 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
907 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000908
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000909 def __init__(self, version,
910 linebreakprops=False,
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000911 expand=1,
912 cjk_check=True):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000913 self.changed = []
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000914 table = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300915 with open_data(UNICODE_DATA, version) as file:
916 while 1:
917 s = file.readline()
918 if not s:
919 break
920 s = s.strip().split(";")
921 char = int(s[0], 16)
922 table[char] = s
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000923
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000924 cjk_ranges_found = []
925
Martin v. Löwis97225da2002-11-24 23:05:09 +0000926 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000927 if expand:
928 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000929 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000930 s = table[i]
931 if s:
932 if s[1][-6:] == "First>":
933 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000934 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000935 elif s[1][-5:] == "Last>":
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000936 if s[1].startswith("<CJK Ideograph"):
937 cjk_ranges_found.append((field[0],
938 s[0]))
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000939 s[1] = ""
940 field = None
941 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000942 f2 = field[:]
943 f2[0] = "%X" % i
944 table[i] = f2
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000945 if cjk_check and cjk_ranges != cjk_ranges_found:
946 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000947
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000948 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000949 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000950 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000951 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000952
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300953 # check for name aliases and named sequences, see #12753
954 # aliases and named sequences are not in 3.2.0
955 if version != '3.2.0':
956 self.aliases = []
957 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
958 # in order to take advantage of the compression and lookup
959 # algorithms used for the other characters
960 pua_index = NAME_ALIASES_START
961 with open_data(NAME_ALIASES, version) as file:
962 for s in file:
963 s = s.strip()
964 if not s or s.startswith('#'):
965 continue
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500966 char, name, abbrev = s.split(';')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300967 char = int(char, 16)
968 self.aliases.append((name, char))
969 # also store the name in the PUA 1
970 self.table[pua_index][1] = name
971 pua_index += 1
972 assert pua_index - NAME_ALIASES_START == len(self.aliases)
973
974 self.named_sequences = []
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300975 # store named sequences in the PUA 1, in range U+F0100..,
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300976 # in order to take advantage of the compression and lookup
977 # algorithms used for the other characters.
978
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500979 assert pua_index < NAMED_SEQUENCES_START
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300980 pua_index = NAMED_SEQUENCES_START
981 with open_data(NAMED_SEQUENCES, version) as file:
982 for s in file:
983 s = s.strip()
984 if not s or s.startswith('#'):
985 continue
986 name, chars = s.split(';')
987 chars = tuple(int(char, 16) for char in chars.split())
988 # check that the structure defined in makeunicodename is OK
989 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
990 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
991 "the NamedSequence struct and in unicodedata_lookup")
992 self.named_sequences.append((name, chars))
993 # also store these in the PUA 1
994 self.table[pua_index][1] = name
995 pua_index += 1
996 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
997
Martin v. Löwis677bde22002-11-23 22:08:15 +0000998 self.exclusions = {}
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300999 with open_data(COMPOSITION_EXCLUSIONS, version) as file:
1000 for s in file:
1001 s = s.strip()
1002 if not s:
1003 continue
1004 if s[0] == '#':
1005 continue
1006 char = int(s.split()[0],16)
1007 self.exclusions[char] = 1
Martin v. Löwis677bde22002-11-23 22:08:15 +00001008
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001009 widths = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001010 with open_data(EASTASIAN_WIDTH, version) as file:
1011 for s in file:
1012 s = s.strip()
1013 if not s:
1014 continue
1015 if s[0] == '#':
1016 continue
1017 s = s.split()[0].split(';')
1018 if '..' in s[0]:
1019 first, last = [int(c, 16) for c in s[0].split('..')]
1020 chars = list(range(first, last+1))
1021 else:
1022 chars = [int(s[0], 16)]
1023 for char in chars:
1024 widths[char] = s[1]
1025
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001026 for i in range(0, 0x110000):
1027 if table[i] is not None:
1028 table[i].append(widths[i])
1029
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001030 for i in range(0, 0x110000):
1031 if table[i] is not None:
1032 table[i].append(set())
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001033
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001034 with open_data(DERIVED_CORE_PROPERTIES, version) as file:
1035 for s in file:
1036 s = s.split('#', 1)[0].strip()
1037 if not s:
1038 continue
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001039
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001040 r, p = s.split(";")
1041 r = r.strip()
1042 p = p.strip()
1043 if ".." in r:
1044 first, last = [int(c, 16) for c in r.split('..')]
1045 chars = list(range(first, last+1))
1046 else:
1047 chars = [int(r, 16)]
1048 for char in chars:
1049 if table[char]:
1050 # Some properties (e.g. Default_Ignorable_Code_Point)
1051 # apply to unassigned code points; ignore them
1052 table[char][-1].add(p)
1053
1054 with open_data(LINE_BREAK, version) as file:
1055 for s in file:
1056 s = s.partition('#')[0]
1057 s = [i.strip() for i in s.split(';')]
1058 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1059 continue
1060 if '..' not in s[0]:
1061 first = last = int(s[0], 16)
1062 else:
1063 first, last = [int(c, 16) for c in s[0].split('..')]
1064 for char in range(first, last+1):
1065 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +00001066
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001067 # We only want the quickcheck properties
1068 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1069 # Yes is the default, hence only N and M occur
1070 # In 3.2.0, the format was different (NF?_NO)
1071 # The parsing will incorrectly determine these as
1072 # "yes", however, unicodedata.c will not perform quickchecks
1073 # for older versions, and no delta records will be created.
1074 quickchecks = [0] * 0x110000
1075 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001076 with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
1077 for s in file:
1078 if '#' in s:
1079 s = s[:s.index('#')]
1080 s = [i.strip() for i in s.split(';')]
1081 if len(s) < 2 or s[1] not in qc_order:
1082 continue
1083 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1084 quickcheck_shift = qc_order.index(s[1])*2
1085 quickcheck <<= quickcheck_shift
1086 if '..' not in s[0]:
1087 first = last = int(s[0], 16)
1088 else:
1089 first, last = [int(c, 16) for c in s[0].split('..')]
1090 for char in range(first, last+1):
1091 assert not (quickchecks[char]>>quickcheck_shift)&3
1092 quickchecks[char] |= quickcheck
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001093 for i in range(0, 0x110000):
1094 if table[i] is not None:
1095 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +00001096
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001097 with open_data(UNIHAN, version) as file:
1098 zip = zipfile.ZipFile(file)
1099 if version == '3.2.0':
1100 data = zip.open('Unihan-3.2.0.txt').read()
1101 else:
1102 data = zip.open('Unihan_NumericValues.txt').read()
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001103 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001104 if not line.startswith('U+'):
1105 continue
1106 code, tag, value = line.split(None, 3)[:3]
1107 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1108 'kOtherNumeric'):
1109 continue
1110 value = value.strip().replace(',', '')
1111 i = int(code[2:], 16)
1112 # Patch the numeric field
1113 if table[i] is not None:
1114 table[i][8] = value
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05001115 sc = self.special_casing = {}
1116 with open_data(SPECIAL_CASING, version) as file:
1117 for s in file:
1118 s = s[:-1].split('#', 1)[0]
1119 if not s:
1120 continue
1121 data = s.split("; ")
1122 if data[4]:
1123 # We ignore all conditionals (since they depend on
1124 # languages) except for one, which is hardcoded. See
1125 # handle_capital_sigma in unicodeobject.c.
1126 continue
1127 c = int(data[0], 16)
1128 lower = [int(char, 16) for char in data[1].split()]
1129 title = [int(char, 16) for char in data[2].split()]
1130 upper = [int(char, 16) for char in data[3].split()]
1131 sc[c] = (lower, title, upper)
Benjamin Petersond5890c82012-01-14 13:23:30 -05001132 cf = self.case_folding = {}
1133 if version != '3.2.0':
1134 with open_data(CASE_FOLDING, version) as file:
1135 for s in file:
1136 s = s[:-1].split('#', 1)[0]
1137 if not s:
1138 continue
1139 data = s.split("; ")
1140 if data[1] in "CF":
1141 c = int(data[0], 16)
1142 cf[c] = [int(char, 16) for char in data[2].split()]
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001143
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001144 def uselatin1(self):
1145 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +00001146 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001147
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001148# hash table tools
1149
1150# this is a straight-forward reimplementation of Python's built-in
1151# dictionary type, using a static data structure, and a custom string
1152# hash algorithm.
1153
def myhash(s, magic):
    """Case-insensitive 24-bit string hash used for the name lookup table.

    Multiplies by *magic* per character; whenever the accumulator grows
    past 24 bits, the overflowed byte is folded back into the low bits,
    so the result always fits in 24 bits.
    """
    value = 0
    for ch in s.upper():
        value = value * magic + ord(ch)
        overflow = value & 0xff000000
        if overflow:
            # fold the high byte back into the low 24 bits
            value = (value ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return value
1162
# candidate hash-table geometries: (power-of-two size, polynomial) pairs;
# Hash.__init__ picks the first size larger than the data set and uses the
# polynomial to perturb the collision probe sequence
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1169
class Hash:
    """Static open-addressing hash table, dumped as C arrays.

    Reimplements Python's dictionary probing with a fixed table and the
    custom myhash() string hash, so unicodedata.c can do name->code
    lookups without building a dict at runtime.
    """
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first power of two larger than the data
        # set; `poly` perturbs the probe sequence on collisions
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # collision: derive a probe increment from the hash and walk
            # the table until a free slot is found
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots are encoded as 0 in the emitted C array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, followed by the #defines
        # unicodedata.c needs to reproduce the probe sequence
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1233
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001234# stuff to deal with arrays of unsigned integers
1235
class Array:
    """A named sequence of unsigned integers, writable as C source."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the array to *file* as a static C array declaration.

        The element type is the smallest unsigned C integer that holds
        every value.  If *trace* is true, the array's byte size is
        reported on stderr.
        """
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        # choose the narrowest element type that fits
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = "    "
            for item in self.data:
                piece = str(item) + ", "
                # wrap output lines at 78 columns
                if len(line) + len(piece) > 78:
                    file.write(line + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1267
def getsize(data):
    """Return the smallest C integer width in bytes (1, 2 or 4) able to
    hold every value of *data* (a non-empty sequence of ints)."""
    biggest = max(data)
    if biggest < 256:
        return 1
    if biggest < 65536:
        return 2
    return 4
1277
Tim Peters21013482000-09-25 07:13:41 +00001278def splitbins(t, trace=0):
1279 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1280
1281 t is a sequence of ints. This function can be useful to save space if
1282 many of the ints are the same. t1 and t2 are lists of ints, and shift
1283 is an int, chosen to minimize the combined size of t1 and t2 (in C
1284 code), and where for each i in range(len(t)),
1285 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1286 where mask is a bitmask isolating the last "shift" bits.
1287
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001288 If optional arg trace is non-zero (default zero), progress info
1289 is printed to sys.stderr. The higher the value, the more info
1290 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001291 """
1292
Tim Peters21013482000-09-25 07:13:41 +00001293 if trace:
1294 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001295 print("%d+%d bins at shift %d; %d bytes" % (
1296 len(t1), len(t2), shift, bytes), file=sys.stderr)
1297 print("Size of original table:", len(t)*getsize(t), \
1298 "bytes", file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001299 n = len(t)-1 # last valid index
1300 maxshift = 0 # the most we can shift n and still have something left
1301 if n > 0:
1302 while n >> 1:
1303 n >>= 1
1304 maxshift += 1
1305 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001306 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001307 t = tuple(t) # so slices can be dict keys
1308 for shift in range(maxshift + 1):
1309 t1 = []
1310 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001311 size = 2**shift
1312 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001313 for i in range(0, len(t), size):
1314 bin = t[i:i+size]
1315 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001316 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001317 index = len(t2)
1318 bincache[bin] = index
1319 t2.extend(bin)
1320 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001321 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001322 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001323 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001324 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001325 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001326 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001327 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001328 t1, t2, shift = best
1329 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001330 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001331 dump(t1, t2, shift, bytes)
1332 if __debug__:
1333 # exhaustively verify that the decomposition is correct
1334 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001335 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001336 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1337 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001338
if __name__ == "__main__":
    # Regenerate all database headers; the argument is presumably the
    # trace flag threaded through to the make* helpers -- TODO confirm
    # against the maketables definition earlier in this file.
    maketables(1)