# Source: Tools/unicode/makeunicodedata.py (platform/external/python/cpython3)
# blob: 15841d7c519e9d4ef1441d23724c422787b2d8b4

#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.0 database file to
# Modules/unicodedata_db.h and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields
# 2000-11-03 fl expand first/last ranges
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
#

				17	import sys
				18
				19	SCRIPT = sys.argv[0]
Fredrik Lundh	cfcea49	2000-09-25 08:07:06 +0000	[diff] [blame]	20	VERSION = "1.1"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	21
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	22	UNICODE_DATA = "UnicodeData-Latest.txt"
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	23
				24	CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
				25	"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
				26	"Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
				27	"So" ]
				28
				29	BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
				30	"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
				31	"ON" ]
				32
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	33	# note: should match definitions in Objects/unicodectype.c
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	34	ALPHA_MASK = 0x01
				35	DECIMAL_MASK = 0x02
				36	DIGIT_MASK = 0x04
				37	LOWER_MASK = 0x08
Fredrik Lundh	0f8fad4	2000-09-25 21:01:56 +0000	[diff] [blame]	38	LINEBREAK_MASK = 0x10
Fredrik Lundh	e9133f7	2000-09-25 17:59:57 +0000	[diff] [blame]	39	SPACE_MASK = 0x20
				40	TITLE_MASK = 0x40
				41	UPPER_MASK = 0x80
				42
def _writeln(fp, text=""):
    # write one line of text to fp (stand-in for "print >>fp, text" that
    # behaves the same on Python 2 and Python 3)
    fp.write(text + "\n")


def _lookup(item, table, cache):
    # return the position of item in table, appending it first if it has
    # not been seen before; cache maps every stored item to its position
    i = cache.get(item)
    if i is None:
        cache[item] = i = len(table)
        table.append(item)
    return i


def _delta(field, char):
    # delta predictor: encode a hexadecimal case-mapping field as its
    # difference from the character itself, wrapped to 16 bits (0 when
    # the field is empty)
    if field:
        return (int(field, 16) - char) & 0xffff
    return 0


def _write_names(fp, declaration, names):
    # emit a NULL-terminated C array of string literals
    _writeln(fp, declaration)
    for name in names:
        _writeln(fp, " \"%s\"," % name)
    _writeln(fp, " NULL")
    _writeln(fp, "};")


def maketables(trace=0):
    """Generate the unicode property and type database headers.

    Reads the file named by UNICODE_DATA and writes two C headers,
    "Modules/unicodedata_db.h" and "Objects/unicodetype_db.h" (paths are
    used as given, i.e. relative to the current working directory).
    Progress messages are written to sys.stdout.

    trace is passed through to splitbins(); a non-zero value makes it
    report table sizes on sys.stderr.
    """

    unicode = UnicodeData(UNICODE_DATA)

    _writeln(sys.stdout, "--- Processing %s ..." % UNICODE_DATA)
    _writeln(sys.stdout, "%d characters"
             % len([record for record in unicode.table if record]))

    # extract unicode properties
    dummy = (0, 0, 0, 0)
    table = [dummy]
    # the cache maps record -> position in table.  (it used to be
    # seeded as {0: dummy}, which could never match a record, so an
    # all-zero record was stored a second time.)
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    # 1) database properties
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            item = (
                CATEGORY_NAMES.index(record[2]),
                int(record[3]),
                BIDIRECTIONAL_NAMES.index(record[4]),
                int(record[9] == "Y"),
            )
            # add entry to index and item tables
            index[char] = _lookup(item, table, cache)

    # 2) decomposition data

    # FIXME: <fl> using the encoding stuff from unidb would save
    # another 50k or so, but I'll leave that for 2.1...

    decomp_data = [""]
    # string -> position in decomp_data; replaces a linear
    # decomp_data.index() search for every character
    decomp_cache = {"": 0}
    decomp_index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record and record[5]:
            decomp_index[char] = _lookup(record[5], decomp_data,
                                         decomp_cache)

    FILE = "Modules/unicodedata_db.h"

    _writeln(sys.stdout, "--- Writing %s ..." % FILE)

    _writeln(sys.stdout, "%d unique properties" % len(table))
    _writeln(sys.stdout,
             "%d unique decomposition entries" % len(decomp_data))

    fp = open(FILE, "w")
    try:
        _writeln(fp, "/* this file was generated by %s %s */"
                 % (SCRIPT, VERSION))
        _writeln(fp)
        _writeln(fp, "/* a list of unique database records */")
        _writeln(fp,
            "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            _writeln(fp, " {%d, %d, %d, %d}," % item)
        _writeln(fp, "};")
        _writeln(fp)

        # FIXME: the following tables should be made static, and
        # the support code moved into unicodedatabase.c

        _writeln(fp, "/* string literals */")
        _write_names(fp, "const char *_PyUnicode_CategoryNames[] = {",
                     CATEGORY_NAMES)
        _write_names(fp, "const char *_PyUnicode_BidirectionalNames[] = {",
                     BIDIRECTIONAL_NAMES)
        _write_names(fp, "static const char *decomp_data[] = {",
                     decomp_data)

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        _writeln(fp, "/* index tables for the database records */")
        _writeln(fp, "#define SHIFT %d" % shift)
        Array("index1", index1).dump(fp)
        Array("index2", index2).dump(fp)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        _writeln(fp, "/* index tables for the decomposition data */")
        _writeln(fp, "#define DECOMP_SHIFT %d" % shift)
        Array("decomp_index1", index1).dump(fp)
        Array("decomp_index2", index2).dump(fp)
    finally:
        # make sure the header is flushed and closed even on errors
        fp.close()

    #
    # 3) unicode type data

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}   # record -> position in table (see above)
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
            upper = _delta(record[12], char)
            lower = _delta(record[13], char)
            title = _delta(record[14], char)
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                flags, upper, lower, title, decimal, digit
            )
            # add entry to index and item tables
            index[char] = _lookup(item, table, cache)

    FILE = "Objects/unicodetype_db.h"

    _writeln(sys.stdout, "--- Writing %s ..." % FILE)

    _writeln(sys.stdout, "%d unique character type entries" % len(table))

    fp = open(FILE, "w")
    try:
        _writeln(fp, "/* this file was generated by %s %s */"
                 % (SCRIPT, VERSION))
        _writeln(fp)
        _writeln(fp, "/* a list of unique character type descriptors */")
        _writeln(fp,
            "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            _writeln(fp, " {%d, %d, %d, %d, %d, %d}," % item)
        _writeln(fp, "};")
        _writeln(fp)

        # split type index table
        index1, index2, shift = splitbins(index, trace)

        _writeln(fp, "/* type indexes */")
        _writeln(fp, "#define SHIFT %d" % shift)
        Array("index1", index1).dump(fp)
        Array("index2", index2).dump(fp)
    finally:
        fp.close()

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

				243	import string, sys
				244
				245	class UnicodeData:
				246
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame^]	247	def __init__(self, filename, expand=1):
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	248	file = open(filename)
				249	table = [None] * 65536
				250	while 1:
				251	s = file.readline()
				252	if not s:
				253	break
				254	s = string.split(string.strip(s), ";")
				255	char = string.atoi(s[0], 16)
				256	table[char] = s
				257
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame^]	258	# expand first-last ranges (ignore surrogates and private use)
				259	if expand:
				260	field = None
				261	for i in range(0, 0xD800):
				262	s = table[i]
				263	if s:
				264	if s[1][-6:] == "First>":
				265	s[1] = ""
				266	field = s[:]
				267	elif s[1][-5:] == "Last>":
				268	s[1] = ""
				269	field = None
				270	elif field:
				271	field[0] = hex(i)
				272	table[i] = field
				273
Fredrik Lundh	f367cac	2000-09-24 23:18:31 +0000	[diff] [blame]	274	# public attributes
				275	self.filename = filename
				276	self.table = table
				277	self.chars = range(65536) # unicode
				278
				279	def uselatin1(self):
				280	# restrict character range to ISO Latin 1
				281	self.chars = range(256)
				282
				283	# stuff to deal with arrays of unsigned integers
				284
				285	class Array:
				286
				287	def __init__(self, name, data):
				288	self.name = name
				289	self.data = data
				290
				291	def dump(self, file):
				292	# write data to file, as a C array
				293	size = getsize(self.data)
				294	# print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
				295	file.write("static ")
				296	if size == 1:
				297	file.write("unsigned char")
				298	elif size == 2:
				299	file.write("unsigned short")
				300	else:
				301	file.write("unsigned int")
				302	file.write(" " + self.name + "[] = {\n")
				303	if self.data:
				304	s = " "
				305	for item in self.data:
				306	i = str(item) + ", "
				307	if len(s) + len(i) > 78:
				308	file.write(s + "\n")
				309	s = " " + i
				310	else:
				311	s = s + i
				312	if string.strip(s):
				313	file.write(s + "\n")
				314	file.write("};\n\n")
				315
				316	def getsize(data):
				317	# return smallest possible integer size for the given array
				318	maxdata = max(data)
				319	if maxdata < 256:
				320	return 1
				321	elif maxdata < 65536:
				322	return 2
				323	else:
				324	return 4
				325
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    import sys

    def dump(t1, t2, shift, nbytes):
        sys.stderr.write("%d+%d bins at shift %d; %d bytes\n" % (
            len(t1), len(t2), shift, nbytes))

    if trace:
        sys.stderr.write("Size of original table: %d bytes\n"
                         % (len(t) * getsize(t)))
    n = len(t) - 1    # last valid index
    maxshift = 0      # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    best = None       # (t1, t2, shift) with the smallest total size so far
    best_bytes = None # ... and that size; None until the first candidate
    t = tuple(t)      # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            chunk = t[i:i+size]
            index = bincache.get(chunk)
            if index is None:
                # first time this run of values is seen: store it in t2
                index = len(t2)
                bincache[chunk] = index
                t2.extend(chunk)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1) * getsize(t1) + len(t2) * getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        # strict comparison: on a tie the smaller shift is kept
        if best is None or b < best_bytes:
            best = t1, t2, shift
            best_bytes = b
    t1, t2, shift = best
    if trace:
        sys.stderr.write("Best: ")
        dump(t1, t2, shift, best_bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

				388	if __name__ == "__main__":
Fredrik Lundh	fad27ae	2000-11-03 20:24:15 +0000	[diff] [blame^]	389	maketables(1)