Tools/unicode/makeunicodedata.py - platform/external/python/cpython2 - Gitiles

 #
 # makeunidb.py -- generate a compact version of the unicode property
 # database (unicodedatabase.h)
 #

 import sys

 # Name this script was invoked under and the generator version; both
 # are stamped into the first comment line of the generated header.
 SCRIPT = sys.argv[0]
 VERSION = "1.0"

 # Path of the UnicodeData text file that is read by maketable().
 UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt"

 # General category codes.  A record's category is stored as its position
 # in this list.  NOTE: "Cn" occurs twice (positions 0 and 17); lookups
 # done with list.index() always resolve to position 0.
 CATEGORY_NAMES = [
     "Cn",
     "Lu", "Ll", "Lt",
     "Mn", "Mc", "Me",
     "Nd", "Nl", "No",
     "Zs", "Zl", "Zp",
     "Cc", "Cf", "Cs", "Co", "Cn",
     "Lm", "Lo",
     "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
     "Sm", "Sc", "Sk", "So",
     ]

 # Bidirectional class codes.  A record's class is stored as its position
 # in this list; position 0 is the empty string.
 BIDIRECTIONAL_NAMES = [
     "",
     "L", "LRE", "LRO",
     "R", "AL", "RLE", "RLO",
     "PDF",
     "EN", "ES", "ET", "AN", "CS",
     "NSM", "BN",
     "B", "S", "WS", "ON",
     ]

 def maketable():
     """Generate "unicodedata_db.h" from the UnicodeData file.

     Loads UNICODE_DATA, reduces each character record to a
     (category, combining, bidirectional, mirrored, decomposition) tuple,
     keeps one copy of every distinct tuple, and writes as C source to
     "unicodedata_db.h" in the current directory:

       * the list of unique records,
       * the category and bidirectional name tables, and
       * the SHIFT constant plus the two index arrays produced by
         splitbins(), which map a code point to its record.

     Takes no arguments and returns None.
     """

     db = UnicodeData(UNICODE_DATA)

     # slot 0 is a placeholder record; index entries for code points
     # without data keep pointing at it
     dummy = (0, 0, 0, 0, "NULL")
     table = [dummy]
     # maps record tuple -> slot in table.  It is seeded with the
     # placeholder so that a real record equal to it reuses slot 0
     # instead of being stored a second time.
     cache = {dummy: 0}
     index = [0] * len(db.chars)

     for char in db.chars:
         record = db.table[char]
         if not record:
             continue
         # the decomposition is emitted as a quoted C string literal,
         # or as NULL when the field is empty
         if record[5]:
             decomposition = '"%s"' % record[5]
         else:
             decomposition = "NULL"
         item = (
             CATEGORY_NAMES.index(record[2]),        # category
             int(record[3]),                         # combining
             BIDIRECTIONAL_NAMES.index(record[4]),   # bidirectional
             int(record[9] == "Y"),                  # mirrored
             decomposition,
             )
         # add entry to index and item tables
         slot = cache.get(item)
         if slot is None:
             slot = cache[item] = len(table)
             table.append(item)
         index[char] = slot

     # FIXME: we really should compress the decomposition stuff
     # (see the unidb utilities for one way to do this)

     # split index table; done before the output file is opened so that
     # a failure here does not truncate an existing header
     index1, index2, shift = splitbins(index)

     FILE = "unicodedata_db.h"

     # Write to the file object directly instead of rebinding sys.stdout,
     # so the caller's stdout is never disturbed, and make sure the file
     # is closed even if writing fails.
     out = open(FILE, "w")
     try:
         emit = out.write

         emit("/* this file was generated by %s %s */\n" % (SCRIPT, VERSION))
         emit("\n")
         emit("/* a list of unique database records */\n")
         emit("const _PyUnicode_DatabaseRecord "
              "_PyUnicode_Database_Records[] = {\n")
         for item in table:
             emit("    {%d, %d, %d, %d, %s},\n" % item)
         emit("};\n")
         emit("\n")

         emit("/* string literals */\n")
         for cname, names in (
             ("_PyUnicode_CategoryNames", CATEGORY_NAMES),
             ("_PyUnicode_BidirectionalNames", BIDIRECTIONAL_NAMES),
             ):
             emit("const char *%s[] = {\n" % cname)
             for name in names:
                 emit("    \"%s\",\n" % name)
             emit("    NULL\n")
             emit("};\n")

         emit("/* index tables used to find the right database record */\n")
         emit("#define SHIFT %d\n" % shift)
         Array("index1", index1).dump(out)
         Array("index2", index2).dump(out)
     finally:
         out.close()

 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
 # Copyright (c) 1999-2000 by Secret Labs AB

 # load a unicode-data file from disk

 import string, sys

 class UnicodeData:
     """In-memory copy of a UnicodeData text file.

     Public attributes:
       filename -- the path the data was loaded from
       table    -- list of 65536 entries indexed by code point; each
                   entry is the list of ";"-separated fields of that
                   character's line, or None if the file has no line
                   for that code point
       chars    -- the code points to process (0..65535 unless
                   uselatin1() has been called)
     """

     def __init__(self, filename):
         """Load and parse the file named by `filename`."""
         table = [None] * 65536
         fp = open(filename)
         try:
             for line in fp:
                 line = line.strip()
                 if not line:
                     # a blank line carries no record; skip it rather
                     # than fail while parsing an empty code point
                     continue
                 fields = line.split(";")
                 char = int(fields[0], 16)
                 if char >= len(table):
                     # only 16-bit code points are tabulated; records
                     # beyond that range cannot be stored, so ignore
                     # them instead of raising IndexError
                     continue
                 table[char] = fields
         finally:
             # always release the file handle
             fp.close()

         # public attributes
         self.filename = filename
         self.table = table
         self.chars = range(65536) # unicode

     def uselatin1(self):
         """Restrict the character range to ISO Latin 1 (0..255)."""
         self.chars = range(256)

 # stuff to deal with arrays of unsigned integers

 class Array:
     """A named sequence of unsigned integers that can be dumped as C."""

     def __init__(self, name, data):
         # name: identifier of the generated C array
         # data: the integer values it holds
         self.data = data
         self.name = name

     def dump(self, file):
         """Write the data to `file` as a static C array definition.

         The element type is chosen from getsize(self.data): unsigned
         char for 1, unsigned short for 2, unsigned int otherwise.
         Values are written comma-separated, four-space indented and
         wrapped so that a line does not grow past 78 characters.
         """
         width = getsize(self.data)
         if width == 1:
             ctype = "unsigned char"
         elif width == 2:
             ctype = "unsigned short"
         else:
             ctype = "unsigned int"
         file.write("static ")
         file.write(ctype)
         file.write(" " + self.name + "[] = {\n")
         if self.data:
             line = "    "
             for value in self.data:
                 chunk = str(value) + ", "
                 if len(line) + len(chunk) <= 78:
                     line = line + chunk
                 else:
                     # current line is full: flush it and start a new one
                     file.write(line + "\n")
                     line = "    " + chunk
             if line.strip():
                 file.write(line + "\n")
         file.write("};\n\n")

 def getsize(data):
     """Return the smallest integer size, in bytes, for the given array.

     The result is 1 when every value is below 256, 2 when every value
     is below 65536, and 4 otherwise.  An empty array has no values to
     accommodate and is reported as 1 instead of failing in max().
     """
     if not data:
         return 1
     maxdata = max(data)
     if maxdata < 256:
         return 1
     elif maxdata < 65536:
         return 2
     else:
         return 4

 def splitbins(bins):
     """Split a sparse integer table into a two-level lookup table.

     Returns (t1, t2, shift) such that, with mask = 2**shift - 1,

         value = t2[(t1[char>>shift]<<shift)+(char&mask)]

     and value == 0 means no data (None entries in `bins` are stored
     as 0).  Every shift from 0 to 15 is tried; the first one giving
     the smallest combined size, as measured with getsize(), wins.
     """
     best = None
     best_size = None
     for shift in range(16):
         size = 2**shift
         t1 = []
         t2 = []
         seen = {}   # chunk contents -> offset of that chunk in t2
         for start in range(0, len(bins), size):
             chunk = tuple(bins[start:start+size])
             offset = seen.get(chunk)
             if offset is None:
                 # first time this chunk is seen: append it to t2
                 offset = seen[chunk] = len(t2)
                 for v in chunk:
                     if v is None:
                         t2.append(0)
                     else:
                         t2.append(v)
             t1.append(offset>>shift)
         # determine memory size
         total = len(t1)*getsize(t1) + len(t2)*getsize(t2)
         if best is None or total < best_size:
             best = shift, t1, t2
             best_size = total
     shift, t1, t2 = best
     return t1, t2, shift

 # Script entry point: build the tables only when run directly, not
 # when this file is imported as a module.
 if __name__ == "__main__":
     maketable()
	#
	# makeunidb.py -- generate a compact version of the unicode property
	# database (unicodedatabase.h)
	#

	import sys

	# Name this script was invoked under and the generator version; both
	# are stamped into the first comment line of the generated header.
	SCRIPT = sys.argv[0]
	VERSION = "1.0"

	# Path of the UnicodeData text file that is read by maketable().
	UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt"

	# General category codes.  A record's category is stored as its position
	# in this list.  NOTE: "Cn" occurs twice (positions 0 and 17); lookups
	# done with list.index() always resolve to position 0.
	CATEGORY_NAMES = [
	    "Cn",
	    "Lu", "Ll", "Lt",
	    "Mn", "Mc", "Me",
	    "Nd", "Nl", "No",
	    "Zs", "Zl", "Zp",
	    "Cc", "Cf", "Cs", "Co", "Cn",
	    "Lm", "Lo",
	    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
	    "Sm", "Sc", "Sk", "So",
	    ]

	# Bidirectional class codes.  A record's class is stored as its position
	# in this list; position 0 is the empty string.
	BIDIRECTIONAL_NAMES = [
	    "",
	    "L", "LRE", "LRO",
	    "R", "AL", "RLE", "RLO",
	    "PDF",
	    "EN", "ES", "ET", "AN", "CS",
	    "NSM", "BN",
	    "B", "S", "WS", "ON",
	    ]

	def maketable():
	    """Generate "unicodedata_db.h" from the UnicodeData file.

	    Loads UNICODE_DATA, reduces each character record to a
	    (category, combining, bidirectional, mirrored, decomposition) tuple,
	    keeps one copy of every distinct tuple, and writes as C source to
	    "unicodedata_db.h" in the current directory:

	      * the list of unique records,
	      * the category and bidirectional name tables, and
	      * the SHIFT constant plus the two index arrays produced by
	        splitbins(), which map a code point to its record.

	    Takes no arguments and returns None.
	    """

	    db = UnicodeData(UNICODE_DATA)

	    # slot 0 is a placeholder record; index entries for code points
	    # without data keep pointing at it
	    dummy = (0, 0, 0, 0, "NULL")
	    table = [dummy]
	    # maps record tuple -> slot in table.  It is seeded with the
	    # placeholder so that a real record equal to it reuses slot 0
	    # instead of being stored a second time.
	    cache = {dummy: 0}
	    index = [0] * len(db.chars)

	    for char in db.chars:
	        record = db.table[char]
	        if not record:
	            continue
	        # the decomposition is emitted as a quoted C string literal,
	        # or as NULL when the field is empty
	        if record[5]:
	            decomposition = '"%s"' % record[5]
	        else:
	            decomposition = "NULL"
	        item = (
	            CATEGORY_NAMES.index(record[2]),        # category
	            int(record[3]),                         # combining
	            BIDIRECTIONAL_NAMES.index(record[4]),   # bidirectional
	            int(record[9] == "Y"),                  # mirrored
	            decomposition,
	            )
	        # add entry to index and item tables
	        slot = cache.get(item)
	        if slot is None:
	            slot = cache[item] = len(table)
	            table.append(item)
	        index[char] = slot

	    # FIXME: we really should compress the decomposition stuff
	    # (see the unidb utilities for one way to do this)

	    # split index table; done before the output file is opened so that
	    # a failure here does not truncate an existing header
	    index1, index2, shift = splitbins(index)

	    FILE = "unicodedata_db.h"

	    # Write to the file object directly instead of rebinding sys.stdout,
	    # so the caller's stdout is never disturbed, and make sure the file
	    # is closed even if writing fails.
	    out = open(FILE, "w")
	    try:
	        emit = out.write

	        emit("/* this file was generated by %s %s */\n" % (SCRIPT, VERSION))
	        emit("\n")
	        emit("/* a list of unique database records */\n")
	        emit("const _PyUnicode_DatabaseRecord "
	             "_PyUnicode_Database_Records[] = {\n")
	        for item in table:
	            emit("    {%d, %d, %d, %d, %s},\n" % item)
	        emit("};\n")
	        emit("\n")

	        emit("/* string literals */\n")
	        for cname, names in (
	            ("_PyUnicode_CategoryNames", CATEGORY_NAMES),
	            ("_PyUnicode_BidirectionalNames", BIDIRECTIONAL_NAMES),
	            ):
	            emit("const char *%s[] = {\n" % cname)
	            for name in names:
	                emit("    \"%s\",\n" % name)
	            emit("    NULL\n")
	            emit("};\n")

	        emit("/* index tables used to find the right database record */\n")
	        emit("#define SHIFT %d\n" % shift)
	        Array("index1", index1).dump(out)
	        Array("index2", index2).dump(out)
	    finally:
	        out.close()

	# --------------------------------------------------------------------
	# the following support code is taken from the unidb utilities
	# Copyright (c) 1999-2000 by Secret Labs AB

	# load a unicode-data file from disk

	import string, sys

	class UnicodeData:
	    """In-memory copy of a UnicodeData text file.

	    Public attributes:
	      filename -- the path the data was loaded from
	      table    -- list of 65536 entries indexed by code point; each
	                  entry is the list of ";"-separated fields of that
	                  character's line, or None if the file has no line
	                  for that code point
	      chars    -- the code points to process (0..65535 unless
	                  uselatin1() has been called)
	    """

	    def __init__(self, filename):
	        """Load and parse the file named by `filename`."""
	        table = [None] * 65536
	        fp = open(filename)
	        try:
	            for line in fp:
	                line = line.strip()
	                if not line:
	                    # a blank line carries no record; skip it rather
	                    # than fail while parsing an empty code point
	                    continue
	                fields = line.split(";")
	                char = int(fields[0], 16)
	                if char >= len(table):
	                    # only 16-bit code points are tabulated; records
	                    # beyond that range cannot be stored, so ignore
	                    # them instead of raising IndexError
	                    continue
	                table[char] = fields
	        finally:
	            # always release the file handle
	            fp.close()

	        # public attributes
	        self.filename = filename
	        self.table = table
	        self.chars = range(65536) # unicode

	    def uselatin1(self):
	        """Restrict the character range to ISO Latin 1 (0..255)."""
	        self.chars = range(256)

	# stuff to deal with arrays of unsigned integers

	class Array:
	    """A named sequence of unsigned integers that can be dumped as C."""

	    def __init__(self, name, data):
	        # name: identifier of the generated C array
	        # data: the integer values it holds
	        self.name = name
	        self.data = data

	    def dump(self, file):
	        """Write the data to `file` as a static C array definition.

	        The element type is chosen from getsize(self.data): unsigned
	        char for 1, unsigned short for 2, unsigned int otherwise.
	        Values are written comma-separated, four-space indented and
	        wrapped so that a line does not grow past 78 characters.
	        """
	        size = getsize(self.data)
	        file.write("static ")
	        if size == 1:
	            file.write("unsigned char")
	        elif size == 2:
	            file.write("unsigned short")
	        else:
	            file.write("unsigned int")
	        file.write(" " + self.name + "[] = {\n")
	        if self.data:
	            s = "    "
	            for item in self.data:
	                i = str(item) + ", "
	                if len(s) + len(i) > 78:
	                    # current line is full: flush it and start a new one
	                    file.write(s + "\n")
	                    s = "    " + i
	                else:
	                    s = s + i
	            if s.strip():
	                file.write(s + "\n")
	        file.write("};\n\n")

	def getsize(data):
	    """Return the smallest integer size, in bytes, for the given array.

	    The result is 1 when every value is below 256, 2 when every value
	    is below 65536, and 4 otherwise.  An empty array has no values to
	    accommodate and is reported as 1 instead of failing in max().
	    """
	    if not data:
	        return 1
	    maxdata = max(data)
	    if maxdata < 256:
	        return 1
	    elif maxdata < 65536:
	        return 2
	    else:
	        return 4

	def splitbins(bins):
	    """Split a sparse integer table into a two-level lookup table.

	    Returns (t1, t2, shift) such that, with mask = 2**shift - 1,

	        value = t2[(t1[char>>shift]<<shift)+(char&mask)]

	    and value == 0 means no data (None entries in `bins` are stored
	    as 0).  Every shift from 0 to 15 is tried; the first one giving
	    the smallest combined size, as measured with getsize(), wins.
	    """
	    best = None
	    best_size = None
	    for shift in range(16):
	        bin1 = []
	        bin2 = []
	        size = 2**shift
	        bincache = {}   # chunk contents -> offset of that chunk in bin2
	        for i in range(0, len(bins), size):
	            chunk = tuple(bins[i:i+size])
	            index = bincache.get(chunk)
	            if index is None:
	                # first time this chunk is seen: append it to bin2
	                index = len(bin2)
	                bincache[chunk] = index
	                for v in chunk:
	                    if v is None:
	                        bin2.append(0)
	                    else:
	                        bin2.append(v)
	            bin1.append(index>>shift)
	        # determine memory size
	        b = len(bin1)*getsize(bin1) + len(bin2)*getsize(bin2)
	        if best is None or b < best_size:
	            best = shift, bin1, bin2
	            best_size = b
	    shift, bin1, bin2 = best
	    return bin1, bin2, shift

	# Script entry point: build the tables only when run directly, not
	# when this file is imported as a module.
	if __name__ == "__main__":
	    maketable()