#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl    created (based on bits and pieces from unidb)
# 2000-09-25 fl    merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl    added character type table
# 2000-09-26 fl    added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl    expand first/last ranges
# 2001-01-19 fl    added character name tables (2.1)
# 2001-01-21 fl    added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd    use string methods
# 2002-10-18 mvl   update to Unicode 3.2
# 2002-10-22 mvl   generate NFC tables
# 2002-11-24 mvl   expand all ranges, sort names version-independently
# 2002-11-25 mvl   add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl   update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb    add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio  add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import os
import sys
import zipfile

from textwrap import dedent

# Name/version of this generator, embedded in the generated headers.
SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst, and
#   * Doc/library/unicodedata.rst
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "10.0.0"

# Input file name templates; "%s" is filled with "" for the current
# version or "-<version>" for an old version (see maketables()).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Old UCD versions for which delta tables are generated (see
# merge_old_version / the "changed" handling in makeunicodedata).
old_versions = ["3.2.0"]

# NOTE: list position is the value stored in the generated database
# records (CATEGORY_NAMES.index(...) / BIDIRECTIONAL_NAMES.index(...)
# below), so entries must never be reordered -- only appended.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line_Break classes that force a mandatory break (used by the
# LINEBREAK_MASK computation in makeunicodetype).
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# bit masks for the "flags" field of a type record
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# CJK unified ideograph first/last code points (hex strings);
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FEA'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
]
109
def maketables(trace=0):
    """Drive the whole generation process.

    Reads the current Unicode database, merges in the deltas for every
    old version listed in old_versions, then regenerates the three C
    header files (name, data and type tables).
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNIDATA_VERSION)
    # count assigned code points (entries with a real record)
    print(sum(1 for record in unicode.table if record), "characters")

    # fold every supported old UCD version into the current database
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-" + version), "...")
        previous = UnicodeData(version, cjk_check=False)
        print(sum(1 for record in previous.table if record), "characters")
        merge_old_version(version, unicode, previous)

    # emit the generated headers
    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000128
129# --------------------------------------------------------------------
130# unicode character properties
131
def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h.

    Emits, in this order: the deduplicated per-character property
    records with a two-level (splitbins) index, the decomposition data
    with its own index, the NFC first/last reindex tables plus the
    composition matrix, the string-literal name arrays, and one
    change_record/normalization lookup pair per old UCD version found
    in unicode.changed.

    unicode -- a parsed UnicodeData instance
    trace   -- verbosity flag forwarded to splitbins()
    """

    # record 0 is a dummy shared by all unassigned code points
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties; category/bidi/width are stored
            # as indexes into the name lists written out further below
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            # record[17] is the normalization quickcheck value attached by
            # UnicodeData (formatted with %d below, so presumably an int
            # bitmask -- confirm against the UnicodeData class)
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data_cache = {}   # tuple(entry) -> offset of entry in decomp_data
    decomp_data = [0]        # offset 0 means "no decomposition"
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix: a leading "<tag>" marks a compatibility mapping
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first element packs prefix index + length
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs: canonical (no prefix) two-character
                # decompositions, not excluded, whose first character has
                # combining class 0
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # Share identical decomposition entries.  BUG FIX: the
                # previous code used decomp_data.index(decomp), which
                # searches a flat list of ints for a *list* and therefore
                # always raised ValueError -- duplicates were re-appended
                # on every occurrence.  A tuple-keyed dict makes the
                # intended dedup actually work (and is O(1) per lookup).
                key = tuple(decomp)
                i = decomp_data_cache.get(key)
                if i is None:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
                    decomp_data_cache[key] = i
            else:
                i = 0
            decomp_index[char] = i

    # compress the sparse comp_first/comp_last marker arrays into
    # contiguous (start, end) ranges with sequential indices
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # dense first x last matrix mapping a pair to its composed character
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()
380
381# --------------------------------------------------------------------
382# unicode character type tables
383
def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h.

    Emits the deduplicated character type records (case mappings,
    digit/decimal values and flag bits), the extended-case array, a
    two-level (splitbins) index, and the generated C functions
    _PyUnicode_ToNumeric, _PyUnicode_IsWhitespace and
    _PyUnicode_IsLinebreak.

    unicode -- a parsed UnicodeData instance
    trace   -- verbosity flag forwarded to splitbins()
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            # (removed dead local `delta = True` -- it was never read)
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # printable: everything except Other/Separator categories,
            # plus the plain space character itself
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            # simple case mappings (fields 12-14); fall back to the
            # character itself (or upper, for title) when empty
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # a case folding different from the simple lowercase forces
            # the extended representation below
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    # store deltas relative to the character, which
                    # dedups far better than absolute code points
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables (deduplicated via cache)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
593
594# --------------------------------------------------------------------
595# unicode name database
596
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h from *unicode* (a UnicodeData
    instance).

    Three data structures are emitted: a compressed lexicon of the words
    occurring in character names, a code->name "phrasebook" built from
    lexicon indexes, and a static name->code hash table.  *trace* is
    passed through to the dump routines for size statistics.
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                # append NUL so word splitting can distinguish final words
                names[char] = name + chr(0)

    print(len([n for n in names if n is not None]), "distinct names")

    # collect unique words from names (note that we differentiate between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            # NOTE: "for w in w" deliberately rebinds the loop name; each
            # word's list grows by one entry per occurrence, so its length
            # is the word's frequency
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need (words beyond the
    # one-byte range are encoded as an escape byte plus a second byte)
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry must fit in an unsigned char
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook index table to save space
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    print(file=fp)
    print('static const unsigned int aliases_start = %#x;' %
          NAME_ALIASES_START, file=fp)
    print('static const unsigned int aliases_end = %#x;' %
          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

    print('static const unsigned int name_aliases[] = {', file=fp)
    for name, codepoint in unicode.aliases:
        print('    0x%04X,' % codepoint, file=fp)
    print('};', file=fp)

    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
    # sequences or sequences with non-BMP chars are added.
    # unicodedata_lookup should be adapted too.
    print(dedent("""
        typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
        } named_sequence;
        """), file=fp)

    print('static const unsigned int named_sequences_start = %#x;' %
          NAMED_SEQUENCES_START, file=fp)
    print('static const unsigned int named_sequences_end = %#x;' %
          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

    print('static const named_sequence named_sequences[] = {', file=fp)
    for name, sequence in unicode.named_sequences:
        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
    print('};', file=fp)

    fp.close()
788
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000789
def merge_old_version(version, new, old):
    """Record the differences between *old* (an older UnicodeData) and
    *new* as a delta record appended to new.changed.

    For most fields a per-codepoint change array is built, where 0xFF
    means "no change"; normalization changes are kept as a list of
    (codepoint, old-decomposition) pairs.  The field index *k* follows
    the record structure documented on the UnicodeData class.

    Raises NotImplementedError when the composition exclusions differ,
    and an ad-hoc Difference exception for any field change this script
    cannot express yet.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        # bidi-mirrored flag; stored as character digits
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                        normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000885
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000886def open_data(template, version):
887 local = template % ('-'+version,)
888 if not os.path.exists(local):
889 import urllib.request
890 if version == '3.2.0':
891 # irregular url structure
892 url = 'http://www.unicode.org/Public/3.2-Update/' + local
893 else:
894 url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
895 urllib.request.urlretrieve(url, filename=local)
896 if local.endswith('.txt'):
897 return open(local, encoding='utf-8')
898 else:
899 # Unihan.zip
900 return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000901
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000902# --------------------------------------------------------------------
903# the following support code is taken from the unidb utilities
904# Copyright (c) 1999-2000 by Secret Labs AB
905
906# load a unicode-data file from disk
907
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000908class UnicodeData:
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000909 # Record structure:
910 # [ID, name, category, combining, bidi, decomp, (6)
911 # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
912 # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
913 # derived-props] (17)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000914
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000915 def __init__(self, version,
916 linebreakprops=False,
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000917 expand=1,
918 cjk_check=True):
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000919 self.changed = []
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000920 table = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +0300921 with open_data(UNICODE_DATA, version) as file:
922 while 1:
923 s = file.readline()
924 if not s:
925 break
926 s = s.strip().split(";")
927 char = int(s[0], 16)
928 table[char] = s
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000929
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000930 cjk_ranges_found = []
931
Martin v. Löwis97225da2002-11-24 23:05:09 +0000932 # expand first-last ranges
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000933 if expand:
934 field = None
Martin v. Löwis97225da2002-11-24 23:05:09 +0000935 for i in range(0, 0x110000):
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000936 s = table[i]
937 if s:
938 if s[1][-6:] == "First>":
939 s[1] = ""
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000940 field = s
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000941 elif s[1][-5:] == "Last>":
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000942 if s[1].startswith("<CJK Ideograph"):
943 cjk_ranges_found.append((field[0],
944 s[0]))
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000945 s[1] = ""
946 field = None
947 elif field:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000948 f2 = field[:]
949 f2[0] = "%X" % i
950 table[i] = f2
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000951 if cjk_check and cjk_ranges != cjk_ranges_found:
952 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000953
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000954 # public attributes
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000955 self.filename = UNICODE_DATA % ''
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000956 self.table = table
Georg Brandlbf82e372008-05-16 17:02:34 +0000957 self.chars = list(range(0x110000)) # unicode 3.2
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000958
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300959 # check for name aliases and named sequences, see #12753
960 # aliases and named sequences are not in 3.2.0
961 if version != '3.2.0':
962 self.aliases = []
963 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
964 # in order to take advantage of the compression and lookup
965 # algorithms used for the other characters
966 pua_index = NAME_ALIASES_START
967 with open_data(NAME_ALIASES, version) as file:
968 for s in file:
969 s = s.strip()
970 if not s or s.startswith('#'):
971 continue
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500972 char, name, abbrev = s.split(';')
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300973 char = int(char, 16)
974 self.aliases.append((name, char))
975 # also store the name in the PUA 1
976 self.table[pua_index][1] = name
977 pua_index += 1
978 assert pua_index - NAME_ALIASES_START == len(self.aliases)
979
980 self.named_sequences = []
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300981 # store named sequences in the PUA 1, in range U+F0100..,
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300982 # in order to take advantage of the compression and lookup
983 # algorithms used for the other characters.
984
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500985 assert pua_index < NAMED_SEQUENCES_START
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300986 pua_index = NAMED_SEQUENCES_START
987 with open_data(NAMED_SEQUENCES, version) as file:
988 for s in file:
989 s = s.strip()
990 if not s or s.startswith('#'):
991 continue
992 name, chars = s.split(';')
993 chars = tuple(int(char, 16) for char in chars.split())
994 # check that the structure defined in makeunicodename is OK
995 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
996 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
997 "the NamedSequence struct and in unicodedata_lookup")
998 self.named_sequences.append((name, chars))
999 # also store these in the PUA 1
1000 self.table[pua_index][1] = name
1001 pua_index += 1
1002 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
1003
Martin v. Löwis677bde22002-11-23 22:08:15 +00001004 self.exclusions = {}
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001005 with open_data(COMPOSITION_EXCLUSIONS, version) as file:
1006 for s in file:
1007 s = s.strip()
1008 if not s:
1009 continue
1010 if s[0] == '#':
1011 continue
1012 char = int(s.split()[0],16)
1013 self.exclusions[char] = 1
Martin v. Löwis677bde22002-11-23 22:08:15 +00001014
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001015 widths = [None] * 0x110000
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001016 with open_data(EASTASIAN_WIDTH, version) as file:
1017 for s in file:
1018 s = s.strip()
1019 if not s:
1020 continue
1021 if s[0] == '#':
1022 continue
1023 s = s.split()[0].split(';')
1024 if '..' in s[0]:
1025 first, last = [int(c, 16) for c in s[0].split('..')]
1026 chars = list(range(first, last+1))
1027 else:
1028 chars = [int(s[0], 16)]
1029 for char in chars:
1030 widths[char] = s[1]
1031
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00001032 for i in range(0, 0x110000):
1033 if table[i] is not None:
1034 table[i].append(widths[i])
1035
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001036 for i in range(0, 0x110000):
1037 if table[i] is not None:
1038 table[i].append(set())
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001039
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001040 with open_data(DERIVED_CORE_PROPERTIES, version) as file:
1041 for s in file:
1042 s = s.split('#', 1)[0].strip()
1043 if not s:
1044 continue
Martin v. Löwis13c3e382007-08-14 22:37:03 +00001045
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001046 r, p = s.split(";")
1047 r = r.strip()
1048 p = p.strip()
1049 if ".." in r:
1050 first, last = [int(c, 16) for c in r.split('..')]
1051 chars = list(range(first, last+1))
1052 else:
1053 chars = [int(r, 16)]
1054 for char in chars:
1055 if table[char]:
1056 # Some properties (e.g. Default_Ignorable_Code_Point)
1057 # apply to unassigned code points; ignore them
1058 table[char][-1].add(p)
1059
1060 with open_data(LINE_BREAK, version) as file:
1061 for s in file:
1062 s = s.partition('#')[0]
1063 s = [i.strip() for i in s.split(';')]
1064 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1065 continue
1066 if '..' not in s[0]:
1067 first = last = int(s[0], 16)
1068 else:
1069 first, last = [int(c, 16) for c in s[0].split('..')]
1070 for char in range(first, last+1):
1071 table[char][-1].add('Line_Break')
Florent Xicluna806d8cf2010-03-30 19:34:18 +00001072
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001073 # We only want the quickcheck properties
1074 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1075 # Yes is the default, hence only N and M occur
1076 # In 3.2.0, the format was different (NF?_NO)
1077 # The parsing will incorrectly determine these as
1078 # "yes", however, unicodedata.c will not perform quickchecks
1079 # for older versions, and no delta records will be created.
1080 quickchecks = [0] * 0x110000
1081 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001082 with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
1083 for s in file:
1084 if '#' in s:
1085 s = s[:s.index('#')]
1086 s = [i.strip() for i in s.split(';')]
1087 if len(s) < 2 or s[1] not in qc_order:
1088 continue
1089 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1090 quickcheck_shift = qc_order.index(s[1])*2
1091 quickcheck <<= quickcheck_shift
1092 if '..' not in s[0]:
1093 first = last = int(s[0], 16)
1094 else:
1095 first, last = [int(c, 16) for c in s[0].split('..')]
1096 for char in range(first, last+1):
1097 assert not (quickchecks[char]>>quickcheck_shift)&3
1098 quickchecks[char] |= quickcheck
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001099 for i in range(0, 0x110000):
1100 if table[i] is not None:
1101 table[i].append(quickchecks[i])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +00001102
Ezio Melotti2a1e9262011-09-30 08:46:25 +03001103 with open_data(UNIHAN, version) as file:
1104 zip = zipfile.ZipFile(file)
1105 if version == '3.2.0':
1106 data = zip.open('Unihan-3.2.0.txt').read()
1107 else:
1108 data = zip.open('Unihan_NumericValues.txt').read()
Martin v. Löwisbaecd722010-10-11 22:42:28 +00001109 for line in data.decode("utf-8").splitlines():
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001110 if not line.startswith('U+'):
1111 continue
1112 code, tag, value = line.split(None, 3)[:3]
1113 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1114 'kOtherNumeric'):
1115 continue
1116 value = value.strip().replace(',', '')
1117 i = int(code[2:], 16)
1118 # Patch the numeric field
1119 if table[i] is not None:
1120 table[i][8] = value
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05001121 sc = self.special_casing = {}
1122 with open_data(SPECIAL_CASING, version) as file:
1123 for s in file:
1124 s = s[:-1].split('#', 1)[0]
1125 if not s:
1126 continue
1127 data = s.split("; ")
1128 if data[4]:
1129 # We ignore all conditionals (since they depend on
1130 # languages) except for one, which is hardcoded. See
1131 # handle_capital_sigma in unicodeobject.c.
1132 continue
1133 c = int(data[0], 16)
1134 lower = [int(char, 16) for char in data[1].split()]
1135 title = [int(char, 16) for char in data[2].split()]
1136 upper = [int(char, 16) for char in data[3].split()]
1137 sc[c] = (lower, title, upper)
Benjamin Petersond5890c82012-01-14 13:23:30 -05001138 cf = self.case_folding = {}
1139 if version != '3.2.0':
1140 with open_data(CASE_FOLDING, version) as file:
1141 for s in file:
1142 s = s[:-1].split('#', 1)[0]
1143 if not s:
1144 continue
1145 data = s.split("; ")
1146 if data[1] in "CF":
1147 c = int(data[0], 16)
1148 cf[c] = [int(char, 16) for char in data[2].split()]
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +00001149
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001150 def uselatin1(self):
1151 # restrict character range to ISO Latin 1
Georg Brandlbf82e372008-05-16 17:02:34 +00001152 self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001153
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001154# hash table tools
1155
1156# this is a straight-forward reimplementation of Python's built-in
1157# dictionary type, using a static data structure, and a custom string
1158# hash algorithm.
1159
def myhash(s, magic):
    """Case-insensitive string hash for the static name->code table.

    Multiplies the running hash by *magic* for each character and folds
    any bits above bit 23 back into the low 24 bits, so the result is
    always a 24-bit value.
    """
    result = 0
    for ch in s.upper():
        result = result * magic + ord(ch)
        overflow = result & 0xff000000
        if overflow:
            # fold bits 24..31 back into the low 24 bits
            result = (result ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return result
1168
# candidate hash-table sizes: (size, poly) pairs unpacked by Hash below.
# size is a power of two; poly is an offset that Hash turns into the
# probe-sequence polynomial via "poly = size + poly".
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1175
class Hash:
    """Build a static hash table from (key, value) pairs and dump it as C data.

    The table parameters (magic, size, poly) are written out by dump() so
    that the consuming C code can reproduce the exact probe sequence used
    here when it performs lookups.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: the smallest power of two in SIZES that
        # exceeds the number of entries; poly becomes size + its offset
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0   # collision counter

        hash = myhash

        # initialize hash table: open addressing with a double-hash
        # increment; the increment is XORed with the polynomial whenever
        # it overflows the mask, to keep the probe sequence in range
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots are emitted as 0
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array plus #defines for the
        # table parameters the C lookup code needs
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1239
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001240# stuff to deal with arrays of unsigned integers
1241
class Array:
    """A named integer table that can be emitted as a static C array."""

    def __init__(self, name, data):
        # name: C identifier for the array; data: sequence of ints
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the table to *file* as a static C array declaration.

        The element type (unsigned char/short/int) is the narrowest one
        that fits every value; rows are wrapped at 78 columns.  With a
        non-zero *trace*, the byte size is reported on stderr.
        """
        itemsize = getsize(self.data)
        if trace:
            print(self.name+":", itemsize*len(self.data), "bytes", file=sys.stderr)
        if itemsize == 1:
            ctype = "unsigned char"
        elif itemsize == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static " + ctype + " " + self.name + "[] = {\n")
        if self.data:
            line = "    "
            for value in self.data:
                token = str(value) + ", "
                if len(line) + len(token) > 78:
                    file.write(line.rstrip() + "\n")
                    line = "    " + token
                else:
                    line = line + token
            if line.strip():
                file.write(line.rstrip() + "\n")
        file.write("};\n\n")
1273
def getsize(data):
    """Return the smallest unsigned C integer size (1, 2 or 4 bytes)
    able to hold every value in *data*."""
    largest = max(data)
    if largest < 256:
        return 1
    if largest < 65536:
        return 2
    return 4
1283
Tim Peters21013482000-09-25 07:13:41 +00001284def splitbins(t, trace=0):
1285 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1286
1287 t is a sequence of ints. This function can be useful to save space if
1288 many of the ints are the same. t1 and t2 are lists of ints, and shift
1289 is an int, chosen to minimize the combined size of t1 and t2 (in C
1290 code), and where for each i in range(len(t)),
1291 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1292 where mask is a bitmask isolating the last "shift" bits.
1293
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001294 If optional arg trace is non-zero (default zero), progress info
1295 is printed to sys.stderr. The higher the value, the more info
1296 you'll get.
Tim Peters21013482000-09-25 07:13:41 +00001297 """
1298
Tim Peters21013482000-09-25 07:13:41 +00001299 if trace:
1300 def dump(t1, t2, shift, bytes):
Collin Winter6afaeb72007-08-03 17:06:41 +00001301 print("%d+%d bins at shift %d; %d bytes" % (
1302 len(t1), len(t2), shift, bytes), file=sys.stderr)
1303 print("Size of original table:", len(t)*getsize(t), \
1304 "bytes", file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001305 n = len(t)-1 # last valid index
1306 maxshift = 0 # the most we can shift n and still have something left
1307 if n > 0:
1308 while n >> 1:
1309 n >>= 1
1310 maxshift += 1
1311 del n
Christian Heimesa37d4c62007-12-04 23:02:19 +00001312 bytes = sys.maxsize # smallest total size so far
Tim Peters21013482000-09-25 07:13:41 +00001313 t = tuple(t) # so slices can be dict keys
1314 for shift in range(maxshift + 1):
1315 t1 = []
1316 t2 = []
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001317 size = 2**shift
1318 bincache = {}
Tim Peters21013482000-09-25 07:13:41 +00001319 for i in range(0, len(t), size):
1320 bin = t[i:i+size]
1321 index = bincache.get(bin)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001322 if index is None:
Tim Peters21013482000-09-25 07:13:41 +00001323 index = len(t2)
1324 bincache[bin] = index
1325 t2.extend(bin)
1326 t1.append(index >> shift)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001327 # determine memory size
Tim Peters21013482000-09-25 07:13:41 +00001328 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
Fredrik Lundhfad27ae2000-11-03 20:24:15 +00001329 if trace > 1:
Tim Peters21013482000-09-25 07:13:41 +00001330 dump(t1, t2, shift, b)
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001331 if b < bytes:
Tim Peters21013482000-09-25 07:13:41 +00001332 best = t1, t2, shift
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001333 bytes = b
Tim Peters21013482000-09-25 07:13:41 +00001334 t1, t2, shift = best
1335 if trace:
Collin Winter6afaeb72007-08-03 17:06:41 +00001336 print("Best:", end=' ', file=sys.stderr)
Tim Peters21013482000-09-25 07:13:41 +00001337 dump(t1, t2, shift, bytes)
1338 if __debug__:
1339 # exhaustively verify that the decomposition is correct
1340 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
Guido van Rossum805365e2007-05-07 22:24:25 +00001341 for i in range(len(t)):
Tim Peters21013482000-09-25 07:13:41 +00001342 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1343 return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001344
if __name__ == "__main__":
    # Regenerate the database headers.  maketables is defined earlier in
    # this file; the argument 1 presumably enables trace output (compare
    # the trace parameters used throughout) -- confirm against its
    # signature, which is outside this chunk.
    maketables(1)