# (git web-view residue) blob: 5d8014a5da3afe1432a7bfec899267d0bdf4e4ee
#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import os
import sys
import zipfile

from textwrap import dedent

SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
# --------------------
# When changing UCD version please update
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "9.0.0"

# Input file name templates; "%s" receives "" for the current version or
# "-<version>" for an old UCD snapshot (see UnicodeData / maketables).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Old UCD snapshots for which delta tables are generated (ucd_3_2_0 support).
old_versions = ["3.2.0"]

# General-category short names; index into this list is stored in the record.
# NOTE: "Cn" appears twice (indexes 0 and 17) in the historical table.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional-class short names; index 0 is the empty (unassigned) class.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

# East Asian Width property values (UAX #11).
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# Line-break classes that force a mandatory break (UAX #14).
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# Character-type flag bits.
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# CJK Unified Ideograph first/last code points (hex strings);
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FD5'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
]
def maketables(trace=0):
    """Read the Unicode database files and regenerate all three C headers.

    Reads the current UNIDATA_VERSION database plus each snapshot listed in
    ``old_versions`` (for version-delta tables), then writes the name, data
    and type tables via the three maker functions.

    trace -- passed through to the table generators; non-zero enables
             splitbins() diagnostics.
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    # Merge each old snapshot so that delta tables can be generated.
    # cjk_check=False: old snapshots predate the current CJK ranges.
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
128# --------------------------------------------------------------------
129# unicode character properties
130
131def makeunicodedata(unicode, trace):
132
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000133 dummy = (0, 0, 0, 0, 0, 0)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000134 table = [dummy]
135 cache = {0: dummy}
136 index = [0] * len(unicode.chars)
137
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000138 FILE = "Modules/unicodedata_db.h"
139
Collin Winter6afaeb72007-08-03 17:06:41 +0000140 print("--- Preparing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000141
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000142 # 1) database properties
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000143
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000144 for char in unicode.chars:
145 record = unicode.table[char]
146 if record:
147 # extract database properties
148 category = CATEGORY_NAMES.index(record[2])
149 combining = int(record[3])
150 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
151 mirrored = record[9] == "Y"
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000152 eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000153 normalizationquickcheck = record[17]
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000154 item = (
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000155 category, combining, bidirectional, mirrored, eastasianwidth,
156 normalizationquickcheck
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000157 )
158 # add entry to index and item tables
159 i = cache.get(item)
160 if i is None:
161 cache[item] = i = len(table)
162 table.append(item)
163 index[char] = i
164
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000165 # 2) decomposition data
166
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000167 decomp_data = [0]
168 decomp_prefix = [""]
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000169 decomp_index = [0] * len(unicode.chars)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000170 decomp_size = 0
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000171
Martin v. Löwis677bde22002-11-23 22:08:15 +0000172 comp_pairs = []
173 comp_first = [None] * len(unicode.chars)
174 comp_last = [None] * len(unicode.chars)
175
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000176 for char in unicode.chars:
177 record = unicode.table[char]
178 if record:
179 if record[5]:
Walter Dörwaldaaab30e2002-09-11 20:36:02 +0000180 decomp = record[5].split()
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000181 if len(decomp) > 19:
Collin Wintera817e582007-08-22 23:05:06 +0000182 raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000183 # prefix
184 if decomp[0][0] == "<":
185 prefix = decomp.pop(0)
186 else:
187 prefix = ""
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000188 try:
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000189 i = decomp_prefix.index(prefix)
190 except ValueError:
191 i = len(decomp_prefix)
192 decomp_prefix.append(prefix)
193 prefix = i
194 assert prefix < 256
195 # content
Georg Brandlbf82e372008-05-16 17:02:34 +0000196 decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
Martin v. Löwis677bde22002-11-23 22:08:15 +0000197 # Collect NFC pairs
198 if not prefix and len(decomp) == 3 and \
199 char not in unicode.exclusions and \
200 unicode.table[decomp[1]][3] == "0":
201 p, l, r = decomp
202 comp_first[l] = 1
203 comp_last[r] = 1
204 comp_pairs.append((l,r,char))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000205 try:
206 i = decomp_data.index(decomp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000207 except ValueError:
208 i = len(decomp_data)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000209 decomp_data.extend(decomp)
210 decomp_size = decomp_size + len(decomp) * 2
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000211 else:
212 i = 0
213 decomp_index[char] = i
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000214
Martin v. Löwis677bde22002-11-23 22:08:15 +0000215 f = l = 0
216 comp_first_ranges = []
217 comp_last_ranges = []
218 prev_f = prev_l = None
219 for i in unicode.chars:
220 if comp_first[i] is not None:
221 comp_first[i] = f
222 f += 1
223 if prev_f is None:
224 prev_f = (i,i)
225 elif prev_f[1]+1 == i:
226 prev_f = prev_f[0],i
227 else:
228 comp_first_ranges.append(prev_f)
229 prev_f = (i,i)
230 if comp_last[i] is not None:
231 comp_last[i] = l
232 l += 1
233 if prev_l is None:
234 prev_l = (i,i)
235 elif prev_l[1]+1 == i:
236 prev_l = prev_l[0],i
237 else:
238 comp_last_ranges.append(prev_l)
239 prev_l = (i,i)
240 comp_first_ranges.append(prev_f)
241 comp_last_ranges.append(prev_l)
242 total_first = f
243 total_last = l
244
245 comp_data = [0]*(total_first*total_last)
246 for f,l,char in comp_pairs:
247 f = comp_first[f]
248 l = comp_last[l]
249 comp_data[f*total_last+l] = char
250
Collin Winter6afaeb72007-08-03 17:06:41 +0000251 print(len(table), "unique properties")
252 print(len(decomp_prefix), "unique decomposition prefixes")
253 print(len(decomp_data), "unique decomposition entries:", end=' ')
254 print(decomp_size, "bytes")
255 print(total_first, "first characters in NFC")
256 print(total_last, "last characters in NFC")
257 print(len(comp_pairs), "NFC pairs")
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000258
Collin Winter6afaeb72007-08-03 17:06:41 +0000259 print("--- Writing", FILE, "...")
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000260
Fred Drake9c685052000-10-26 03:56:46 +0000261 fp = open(FILE, "w")
Collin Winter6afaeb72007-08-03 17:06:41 +0000262 print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
263 print(file=fp)
264 print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
265 print("/* a list of unique database records */", file=fp)
266 print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000267 for item in table:
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000268 print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
Collin Winter6afaeb72007-08-03 17:06:41 +0000269 print("};", file=fp)
270 print(file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000271
Collin Winter6afaeb72007-08-03 17:06:41 +0000272 print("/* Reindexing of NFC first characters. */", file=fp)
273 print("#define TOTAL_FIRST",total_first, file=fp)
274 print("#define TOTAL_LAST",total_last, file=fp)
275 print("struct reindex{int start;short count,index;};", file=fp)
Martin v. Löwis59683e82008-06-13 07:50:45 +0000276 print("static struct reindex nfc_first[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000277 for start,end in comp_first_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000278 print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
279 print(" {0,0,0}", file=fp)
280 print("};\n", file=fp)
Martin v. Löwis59683e82008-06-13 07:50:45 +0000281 print("static struct reindex nfc_last[] = {", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000282 for start,end in comp_last_ranges:
Collin Winter6afaeb72007-08-03 17:06:41 +0000283 print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
284 print(" {0,0,0}", file=fp)
285 print("};\n", file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000286
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000287 # FIXME: <fl> the following tables could be made static, and
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000288 # the support code moved into unicodedatabase.c
289
Collin Winter6afaeb72007-08-03 17:06:41 +0000290 print("/* string literals */", file=fp)
291 print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000292 for name in CATEGORY_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000293 print(" \"%s\"," % name, file=fp)
294 print(" NULL", file=fp)
295 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000296
Collin Winter6afaeb72007-08-03 17:06:41 +0000297 print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000298 for name in BIDIRECTIONAL_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000299 print(" \"%s\"," % name, file=fp)
300 print(" NULL", file=fp)
301 print("};", file=fp)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000302
Collin Winter6afaeb72007-08-03 17:06:41 +0000303 print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000304 for name in EASTASIANWIDTH_NAMES:
Collin Winter6afaeb72007-08-03 17:06:41 +0000305 print(" \"%s\"," % name, file=fp)
306 print(" NULL", file=fp)
307 print("};", file=fp)
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000308
Collin Winter6afaeb72007-08-03 17:06:41 +0000309 print("static const char *decomp_prefix[] = {", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000310 for name in decomp_prefix:
Collin Winter6afaeb72007-08-03 17:06:41 +0000311 print(" \"%s\"," % name, file=fp)
312 print(" NULL", file=fp)
313 print("};", file=fp)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000314
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000315 # split record index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000316 index1, index2, shift = splitbins(index, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000317
Collin Winter6afaeb72007-08-03 17:06:41 +0000318 print("/* index tables for the database records */", file=fp)
319 print("#define SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000320 Array("index1", index1).dump(fp, trace)
321 Array("index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000322
Fredrik Lundhe9133f72000-09-25 17:59:57 +0000323 # split decomposition index table
Fredrik Lundhfad27ae2000-11-03 20:24:15 +0000324 index1, index2, shift = splitbins(decomp_index, trace)
Fredrik Lundhcfcea492000-09-25 08:07:06 +0000325
Collin Winter6afaeb72007-08-03 17:06:41 +0000326 print("/* decomposition data */", file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000327 Array("decomp_data", decomp_data).dump(fp, trace)
328
Collin Winter6afaeb72007-08-03 17:06:41 +0000329 print("/* index tables for the decomposition data */", file=fp)
330 print("#define DECOMP_SHIFT", shift, file=fp)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000331 Array("decomp_index1", index1).dump(fp, trace)
332 Array("decomp_index2", index2).dump(fp, trace)
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000333
Martin v. Löwis677bde22002-11-23 22:08:15 +0000334 index, index2, shift = splitbins(comp_data, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000335 print("/* NFC pairs */", file=fp)
336 print("#define COMP_SHIFT", shift, file=fp)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000337 Array("comp_index", index).dump(fp, trace)
338 Array("comp_data", index2).dump(fp, trace)
339
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000340 # Generate delta tables for old versions
341 for version, table, normalization in unicode.changed:
342 cversion = version.replace(".","_")
343 records = [table[0]]
344 cache = {table[0]:0}
345 index = [0] * len(table)
346 for i, record in enumerate(table):
347 try:
348 index[i] = cache[record]
349 except KeyError:
350 index[i] = cache[record] = len(records)
351 records.append(record)
352 index1, index2, shift = splitbins(index, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000353 print("static const change_record change_records_%s[] = {" % cversion, file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000354 for record in records:
Collin Winter6afaeb72007-08-03 17:06:41 +0000355 print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
356 print("};", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000357 Array("changes_%s_index" % cversion, index1).dump(fp, trace)
358 Array("changes_%s_data" % cversion, index2).dump(fp, trace)
Collin Winter6afaeb72007-08-03 17:06:41 +0000359 print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
360 print("{", file=fp)
361 print("\tint index;", file=fp)
362 print("\tif (n >= 0x110000) index = 0;", file=fp)
363 print("\telse {", file=fp)
364 print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
365 print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
366 (cversion, shift, ((1<<shift)-1)), file=fp)
367 print("\t}", file=fp)
368 print("\treturn change_records_%s+index;" % cversion, file=fp)
369 print("}\n", file=fp)
370 print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
371 print("{", file=fp)
372 print("\tswitch(n) {", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000373 for k, v in normalization:
Collin Winter6afaeb72007-08-03 17:06:41 +0000374 print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
375 print("\tdefault: return 0;", file=fp)
376 print("\t}\n}\n", file=fp)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000377
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +0000378 fp.close()
379
# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):
    """Write Objects/unicodetype_db.h: per-character type records
    (case mappings, digit values, flags), the extended-case array, and
    the generated _PyUnicode_ToNumeric / _PyUnicode_IsWhitespace /
    _PyUnicode_IsLinebreak C functions.

    unicode -- a UnicodeData instance.
    trace   -- forwarded to splitbins()/Array.dump() for diagnostics.
    """
    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types; record 0 is the dummy for unassigned chars
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}          # numeric value (string) -> [code points]
    spaces = []           # whitespace code points
    linebreaks = []       # line-break code points
    extra_casing = []     # flat array for multi-character case mappings

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # printable: everything except other control/format/surrogate
            # and separators -- but U+0020 SPACE itself is printable
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # A case folding that differs from the simple lowercase mapping
            # forces the extended-case representation.
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    # store deltas relative to the character itself
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split the type index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
593# --------------------------------------------------------------------
594# unicode name database
595
def makeunicodename(unicode, trace):
    """Generate Modules/unicodename_db.h, the code<->name database.

    Three structures are emitted:
      - a "lexicon": every word occurring in character names, concatenated,
        with bit 7 set on the last character of each word;
      - a "phrasebook": for each code point, the sequence of lexicon word
        indexes making up its name (frequent words take one byte, the rest
        two), accessed through a two-level (split) offset table;
      - a static hash table mapping names back to code points.

    unicode is a UnicodeData instance; trace enables size statistics.
    """

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                # NUL-terminate; the terminator travels with the last word
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a name and words ending a name; the latter include
    # the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    # one entry per occurrence, so len(l) is the frequency
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need: words beyond the
    # "short" range are encoded as (escape byte, low byte) pairs
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # the phrasebook must fit in unsigned chars for the C tables to work
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table into a two-level index to save space
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    print(file=fp)
    print('static const unsigned int aliases_start = %#x;' %
          NAME_ALIASES_START, file=fp)
    print('static const unsigned int aliases_end = %#x;' %
          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

    print('static const unsigned int name_aliases[] = {', file=fp)
    for name, codepoint in unicode.aliases:
        print('    0x%04X,' % codepoint, file=fp)
    print('};', file=fp)

    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
    # sequences or sequences with non-BMP chars are added.
    # unicodedata_lookup should be adapted too.
    print(dedent("""
        typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
        } named_sequence;
        """), file=fp)

    print('static const unsigned int named_sequences_start = %#x;' %
          NAMED_SEQUENCES_START, file=fp)
    print('static const unsigned int named_sequences_end = %#x;' %
          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

    print('static const named_sequence named_sequences[] = {', file=fp)
    for name, sequence in unicode.named_sequences:
        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
    print('};', file=fp)

    fp.close()
787
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000788
789def merge_old_version(version, new, old):
790 # Changes to exclusion file not implemented yet
791 if old.exclusions != new.exclusions:
Collin Wintera817e582007-08-22 23:05:06 +0000792 raise NotImplementedError("exclusions differ")
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000793
794 # In these change records, 0xFF means "no change"
795 bidir_changes = [0xFF]*0x110000
796 category_changes = [0xFF]*0x110000
797 decimal_changes = [0xFF]*0x110000
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000798 mirrored_changes = [0xFF]*0x110000
Benjamin Peterson67752312016-09-14 23:53:47 -0700799 east_asian_width_changes = [0xFF]*0x110000
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000800 # In numeric data, 0 means "no change",
801 # -1 means "did not have a numeric value
802 numeric_changes = [0] * 0x110000
803 # normalization_changes is a list of key-value pairs
804 normalization_changes = []
805 for i in range(0x110000):
806 if new.table[i] is None:
807 # Characters unassigned in the new version ought to
808 # be unassigned in the old one
809 assert old.table[i] is None
810 continue
811 # check characters unassigned in the old version
812 if old.table[i] is None:
813 # category 0 is "unassigned"
814 category_changes[i] = 0
815 continue
816 # check characters that differ
817 if old.table[i] != new.table[i]:
818 for k in range(len(old.table[i])):
819 if old.table[i][k] != new.table[i][k]:
820 value = old.table[i][k]
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300821 if k == 1 and i in PUA_15:
822 # the name is not set in the old.table, but in the
823 # new.table we are using it for aliases and named seq
824 assert value == ''
825 elif k == 2:
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000826 #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
827 category_changes[i] = CATEGORY_NAMES.index(value)
828 elif k == 4:
829 #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
830 bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
831 elif k == 5:
832 #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
833 # We assume that all normalization changes are in 1:1 mappings
834 assert " " not in value
835 normalization_changes.append((i, value))
836 elif k == 6:
837 #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
838 # we only support changes where the old value is a single digit
839 assert value in "0123456789"
840 decimal_changes[i] = int(value)
841 elif k == 8:
842 # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
843 # Since 0 encodes "no change", the old value is better not 0
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000844 if not value:
845 numeric_changes[i] = -1
846 else:
Amaury Forgeot d'Arc7d520792009-10-06 21:03:20 +0000847 numeric_changes[i] = float(value)
848 assert numeric_changes[i] not in (0, -1)
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000849 elif k == 9:
850 if value == 'Y':
851 mirrored_changes[i] = '1'
852 else:
853 mirrored_changes[i] = '0'
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000854 elif k == 11:
855 # change to ISO comment, ignore
856 pass
857 elif k == 12:
858 # change to simple uppercase mapping; ignore
859 pass
860 elif k == 13:
861 # change to simple lowercase mapping; ignore
862 pass
863 elif k == 14:
864 # change to simple titlecase mapping; ignore
865 pass
Benjamin Peterson67752312016-09-14 23:53:47 -0700866 elif k == 15:
867 # change to east asian width
868 east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000869 elif k == 16:
870 # derived property changes; not yet
871 pass
Martin v. Löwisbaecd722010-10-11 22:42:28 +0000872 elif k == 17:
873 # normalization quickchecks are not performed
874 # for older versions
875 pass
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000876 else:
877 class Difference(Exception):pass
Collin Wintera817e582007-08-22 23:05:06 +0000878 raise Difference(hex(i), k, old.table[i], new.table[i])
Martin v. Löwis13c3e382007-08-14 22:37:03 +0000879 new.changed.append((version, list(zip(bidir_changes, category_changes,
Benjamin Peterson67752312016-09-14 23:53:47 -0700880 decimal_changes, mirrored_changes,
881 east_asian_width_changes,
882 numeric_changes)),
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000883 normalization_changes))
Tim Peters88ca4672006-03-10 23:39:56 +0000884
def open_data(template, version):
    """Return an open file object for the UCD data file named by *template*.

    The file is downloaded from unicode.org on first use and cached in the
    current directory.  ``*.txt`` files are opened in UTF-8 text mode;
    anything else (i.e. Unihan.zip) is opened in binary mode.
    """
    local = template % ('-' + version,)
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # the 3.2.0 files live under an irregularly named directory
            url = 'http://www.unicode.org/Public/3.2-Update/' + local
        else:
            url = ('http://www.unicode.org/Public/%s/ucd/' + template) % (version, '')
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    # binary payload (Unihan.zip)
    return open(local, 'rb')
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000900
Fredrik Lundhf367cac2000-09-24 23:18:31 +0000901# --------------------------------------------------------------------
902# the following support code is taken from the unidb utilities
903# Copyright (c) 1999-2000 by Secret Labs AB
904
905# load a unicode-data file from disk
906
class UnicodeData:
    """In-memory representation of one version of the Unicode database.

    Parses UnicodeData.txt plus several auxiliary UCD files and exposes:
      table      -- per code point, the list of fields described below
      chars      -- the list of code points covered (all of 0..0x10FFFF)
      aliases, named_sequences, exclusions, special_casing, case_folding
    """
    # Record structure (per-code-point list in self.table):
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, version,
                 linebreakprops=False,
                 expand=1,
                 cjk_check=True):
        # NOTE(review): linebreakprops is accepted but never referenced in
        # this constructor -- presumably kept for caller compatibility.
        self.changed = []
        table = [None] * 0x110000
        # stage 1: parse UnicodeData.txt into the raw field table
        with open_data(UNICODE_DATA, version) as file:
            while 1:
                s = file.readline()
                if not s:
                    break
                s = s.strip().split(";")
                char = int(s[0], 16)
                table[char] = s

        cjk_ranges_found = []

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        if s[1].startswith("<CJK Ideograph"):
                            cjk_ranges_found.append((field[0],
                                                     s[0]))
                        s[1] = ""
                        field = None
                elif field:
                    # inside a First>..Last> range: copy the range record
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            with open_data(NAME_ALIASES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    char, name, abbrev = s.split(';')
                    char = int(char, 16)
                    self.aliases.append((name, char))
                    # also store the name in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 1, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            with open_data(NAMED_SEQUENCES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    name, chars = s.split(';')
                    chars = tuple(int(char, 16) for char in chars.split())
                    # check that the structure defined in makeunicodename is OK
                    assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                    assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                        "the NamedSequence struct and in unicodedata_lookup")
                    self.named_sequences.append((name, chars))
                    # also store these in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        # stage 2: composition exclusions (set of excluded code points)
        self.exclusions = {}
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                char = int(s.split()[0],16)
                self.exclusions[char] = 1

        # stage 3: east asian width, appended as field 15
        widths = [None] * 0x110000
        with open_data(EASTASIAN_WIDTH, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                if '..' in s[0]:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        # stage 4: derived properties, appended as a set in field 16
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())

        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
            for s in file:
                s = s.split('#', 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                if ".." in r:
                    first, last = [int(c, 16) for c in r.split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(r, 16)]
                for char in chars:
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        # mandatory line breaks are folded into the derived-property set
        with open_data(LINE_BREAK, version) as file:
            for s in file:
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
            for s in file:
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])

        # stage 5: Unihan numeric values patch the numeric field (8)
        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value
        # stage 6: full (one-to-many) case mappings
        sc = self.special_casing = {}
        with open_data(SPECIAL_CASING, version) as file:
            for s in file:
                s = s[:-1].split('#', 1)[0]
                if not s:
                    continue
                data = s.split("; ")
                if data[4]:
                    # We ignore all conditionals (since they depend on
                    # languages) except for one, which is hardcoded. See
                    # handle_capital_sigma in unicodeobject.c.
                    continue
                c = int(data[0], 16)
                lower = [int(char, 16) for char in data[1].split()]
                title = [int(char, 16) for char in data[2].split()]
                upper = [int(char, 16) for char in data[3].split()]
                sc[c] = (lower, title, upper)
        # stage 7: case folding ("C"ommon and "F"ull entries only)
        cf = self.case_folding = {}
        if version != '3.2.0':
            with open_data(CASE_FOLDING, version) as file:
                for s in file:
                    s = s[:-1].split('#', 1)[0]
                    if not s:
                        continue
                    data = s.split("; ")
                    if data[1] in "CF":
                        c = int(data[0], 16)
                        cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001152
Fredrik Lundh9e9bcda2001-01-21 17:01:31 +00001153# hash table tools
1154
1155# this is a straight-forward reimplementation of Python's built-in
1156# dictionary type, using a static data structure, and a custom string
1157# hash algorithm.
1158
def myhash(s, magic):
    """Case-insensitive string hash, folded into 24 bits.

    Multiplies the running hash by *magic* for each character of the
    upper-cased string; whenever the hash grows past 24 bits, the top
    byte is folded back in by xor and masked off.
    """
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        top = h & 0xff000000
        if top:
            h = (h ^ ((top >> 24) & 0xff)) & 0x00ffffff
    return h
1167
# Candidate hash-table configurations as (size, poly) pairs: size is a
# power-of-two table size, and size+poly is the value Hash.__init__ xors
# into the probe increment when it overflows the table mask.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
1174
class Hash:
    """Static open-addressing hash table, dumped as C arrays.

    Reimplements the probing scheme of Python's built-in dictionary with
    a fixed table size and the custom myhash() string hash, so the
    generated C code can perform name->code lookups statically.
    """

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size: first power of two larger than the data set
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0  # collision counter

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # slot taken: derive a probe increment from the hash
            incr = (h ^ (h >> 3)) & mask;
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                # vary the increment; fold with poly when it overflows
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        # empty slots are encoded as 0 in the C array
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array, plus the constants the C
        # lookup code needs to replay the same probe sequence
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1238
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001239# stuff to deal with arrays of unsigned integers
1240
class Array:
    """A named sequence of unsigned integers, writable as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the data to *file* as a static C array declaration,
        using the smallest unsigned element type that fits."""
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        if size == 1:
            ctype = "unsigned char"
        elif size == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static " + ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = "    "
            for item in self.data:
                piece = str(item) + ", "
                # wrap output lines at 78 columns
                if len(line) + len(piece) > 78:
                    file.write(line + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
1272
def getsize(data):
    """Return the smallest C integer size (1, 2 or 4 bytes) able to
    hold every value of the given array of non-negative ints."""
    biggest = max(data)
    if biggest >= 65536:
        return 4
    if biggest >= 256:
        return 2
    return 1
1282
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  The table is cut into bins of 2**shift
    entries; duplicate bins are stored once in t2, while t1 holds, for
    each bin, where its contents start in t2.  The shift minimizing the
    combined C-array size of t1 and t2 is chosen, and for each i in
    range(len(t)):

        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]

    where mask isolates the low "shift" bits.  If trace is non-zero,
    progress and size information goes to sys.stderr; higher values
    print more.
    """
    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t),
              "bytes", file=sys.stderr)
    # the largest shift that still leaves more than one entry per bin
    maxshift = max((len(t) - 1).bit_length() - 1, 0)
    best_bytes = sys.maxsize       # smallest combined size seen so far
    t = tuple(t)                   # hashable, so bins can be dict keys
    for shift in range(maxshift + 1):
        binsize = 2**shift
        t1 = []
        t2 = []
        bincache = {}
        for start in range(0, len(t), binsize):
            bin = t[start:start+binsize]
            index = bincache.get(bin)
            if index is None:
                # first occurrence of this bin: append its contents
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # combined memory footprint of both C arrays
        nbytes = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, nbytes)
        if nbytes < best_bytes:
            best = t1, t2, shift
            best_bytes = nbytes
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, best_bytes)
    if __debug__:
        # exhaustively verify that the decomposition round-trips
        mask = (1 << shift) - 1
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
Fredrik Lundhf367cac2000-09-24 23:18:31 +00001343
if __name__ == "__main__":
    # regenerate all database headers; maketables is defined earlier in
    # the file -- the argument 1 presumably enables trace output (cf. the
    # trace parameters above); confirm against maketables' signature
    maketables(1)