Tools/unicode/mkstringprep.py - platform/external/python/cpython3 - Gitiles

 import re, unicodedata, sys

 # This script converts every code point up to 0x10FFFF with unichr(), so it
 # refuses to run on a narrow (UCS-2) build.
 if sys.maxunicode == 65535:
     raise RuntimeError("need UCS-4 Python")

 def gen_category(cats):
     """Yield, in ascending order, each code point whose general category is in *cats*.

     The whole range 0..0x10FFFF is scanned; *cats* is any container that
     supports ``in`` (for example a list of category names such as "Cn").
     """
     for codepoint in range(0x110000):
         category = unicodedata.category(unichr(codepoint))
         if category not in cats:
             continue
         yield codepoint

 def gen_bidirectional(cats):
     """Yield, in ascending order, each code point whose bidirectional class is in *cats*.

     The whole range 0..0x10FFFF is scanned; *cats* is any container that
     supports ``in`` (for example a list of class names such as "R" or "AL").
     """
     for codepoint in range(0x110000):
         bidi_class = unicodedata.bidirectional(unichr(codepoint))
         if bidi_class not in cats:
             continue
         yield codepoint

 def compact_set(l):
     """Return Python source text for a set containing the integers in *l*.

     *l* must be an iterable of integers in strictly ascending order.  Runs
     of consecutive values are written as ``range(a,b)`` terms and the
     remaining values as a list literal, e.g. ``set([10] + range(1,5))``.
     An empty input yields ``set()``.

     NOTE: the emitted expression joins ``range(...)`` terms with ``+``,
     which is only valid where range() returns a list (Python 2).
     """
     singles = []
     ranges = []
     start = None
     span = 0  # number of values following `start` in the current run
     for e in l:
         if start is None:
             start = e
             span = 0
             continue
         if start + span + 1 == e:
             # e extends the current run of consecutive values.
             span += 1
             continue
         # The run ended: only runs of four or more values become a range.
         if span > 2:
             ranges.append((start, start + span + 1))
         else:
             singles.extend(range(start, start + span + 1))
         start = e
         span = 0
     if start is None:
         # Empty input: without this guard the code below would append None
         # and emit "set([None])".
         return "set()"
     # The trailing run becomes a range as soon as it holds two values.
     if span:
         ranges.append((start, start + span + 1))
     else:
         singles.append(start)
     range_expr = " + ".join(["range(%d,%d)" % r for r in ranges])
     if not singles:
         return "set(%s)" % range_expr
     if not range_expr:
         return "set(%s)" % repr(singles)
     return "set(%s + %s)" % (repr(singles), range_expr)

 ############## Read the tables in the RFC #######################

 # Read the whole RFC text and close the file explicitly rather than leaving
 # the handle open.
 rfc_file = open("rfc3454.txt")
 try:
     data = rfc_file.readlines()
 finally:
     rfc_file.close()

 # `tables` receives one (name, mapping) pair per table, in file order.
 # `curname` is the name of the table being read, or None between tables.
 tables = []
 curname = None
 for l in data:
     l = l.strip()
     if not l:
         continue
     # Skip RFC page breaks
     if l.startswith("Hoffman & Blanchet") or l.startswith("RFC 3454"):
         continue
     # Find start/end lines.  The dot in the table name is escaped so that
     # it only matches a literal ".".
     m = re.match(r"----- (Start|End) Table ([A-Z](\.[0-9])+) -----", l)
     if m:
         if m.group(1) == "Start":
             if curname:
                 # String exceptions are not raisable on Python 2.6+;
                 # use a real exception class.
                 raise ValueError("Double Start: %r" % ((curname, l),))
             curname = m.group(2)
             table = {}
             tables.append((curname, table))
         else:
             if not curname:
                 raise ValueError("End without start: %r" % (l,))
             curname = None
         continue
     if not curname:
         continue
     # Now we are in a table
     fields = l.split(";")
     if len(fields) > 1:
         # Drop comment field
         fields = fields[:-1]
     if len(fields) == 1:
         # Set-style entry: a single code point or a "start-end" range,
         # stored as an identity mapping.
         fields = fields[0].split("-")
         if len(fields) > 1:
             # range
             try:
                 start, end = fields
             except ValueError:
                 raise ValueError("Unpacking problem: %r" % (l,))
         else:
             start = end = fields[0]
         start = int(start, 16)
         end = int(end, 16)
         for i in range(start, end+1):
             table[i] = i
     else:
         # Mapping-style entry: code point; space-separated replacements.
         code, value = fields
         value = value.strip()
         if value:
             value = [int(v, 16) for v in value.split(" ")]
         else:
             # table B.1
             value = None
         table[int(code, 16)] = value

 ########### Generate compact Python versions of the tables #############

 print """# This file is generated by mkstringprep.py. DO NOT EDIT.
 \"\"\"Library that exposes various tables found in the StringPrep RFC 3454.

 There are two kinds of tables: sets, for which a member test is provided,
 and mappings, for which a mapping function is provided.
 \"\"\"

 import unicodedata
 """

 print "assert unicodedata.unidata_version == %s" % repr(unicodedata.unidata_version)

 # A.1 is the table of unassigned characters
 # XXX Plane 15 PUA is listed as unassigned in Python.
 name, table = tables[0]
 del tables[0]
 assert name == "A.1"
 table = set(table.keys())
 Cn = set(gen_category(["Cn"]))

 # FDD0..FDEF are process internal codes
 Cn -= set(range(0xFDD0, 0xFDF0))
 # not a character
 Cn -= set(range(0xFFFE, 0x110000, 0x10000))
 Cn -= set(range(0xFFFF, 0x110000, 0x10000))

 # assert table == Cn

 print """
 def in_table_a1(code):
     if unicodedata.category(code) != 'Cn': return False
     c = ord(code)
     if 0xFDD0 <= c < 0xFDF0: return False
     return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
 """

 # B.1 cannot easily be derived
 name, table = tables[0]
 del tables[0]
 assert name == "B.1"
 table = table.keys()
 table.sort()
 print """
 b1_set = """ + compact_set(table) + """
 def in_table_b1(code):
     return ord(code) in b1_set
 """

 # B.2 and B.3 is case folding.
 # It takes CaseFolding.txt into account, which is
 # not available in the Python database. Since
 # B.2 is derived from B.3, we process B.3 first.
 # B.3 supposedly *is* CaseFolding-3.2.0.txt.

 name, table_b2 = tables[0]
 del tables[0]
 assert name == "B.2"

 name, table_b3 = tables[0]
 del tables[0]
 assert name == "B.3"

 # B.3 is mostly Python's .lower, except for a number
 # of special cases, e.g. considering canonical forms.

 b3_exceptions = {}

 for k,v in table_b2.items():
     if map(ord, unichr(k).lower()) != v:
         b3_exceptions[k] = u"".join(map(unichr,v))

 b3 = b3_exceptions.items()
 b3.sort()

 print """
 b3_exceptions = {"""
 for i,(k,v) in enumerate(b3):
     print "0x%x:%s," % (k, repr(v)),
     if i % 4 == 3:
         print
 print "}"

 print """
 def map_table_b3(code):
     r = b3_exceptions.get(ord(code))
     if r is not None: return r
     return code.lower()
 """

 def map_table_b3(code):
     """Return the B.3 mapping of the single character *code*.

     An entry in b3_exceptions takes precedence; otherwise the result is
     code.lower().
     """
     override = b3_exceptions.get(ord(code))
     return code.lower() if override is None else override

 # B.2 is case folding for NFKC. This is the same as B.3,
 # except where NormalizeWithKC(Fold(a)) !=
 # NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))

 def map_table_b2(a):
     """Return the B.2 mapping (case folding for NFKC) of the character *a*.

     The character is folded with map_table_b3 and NFKC-normalized; that
     result is folded and normalized a second time.  If the second pass
     changes the normalized text, the second result is returned, otherwise
     the plain B.3 folding of *a*.
     """
     folded = map_table_b3(a)
     normalized = unicodedata.normalize("NFKC", folded)
     refolded = u"".join(map(map_table_b3, normalized))
     renormalized = unicodedata.normalize("NFKC", refolded)
     if normalized == renormalized:
         return folded
     return renormalized

 specials = {}
 for k,v in table_b2.items():
     if map(ord, map_table_b2(unichr(k))) != v:
         specials[k] = v

 # B.3 should not add any additional special cases
 assert specials == {}

 print """
 def map_table_b2(a):
     al = map_table_b3(a)
     b = unicodedata.normalize("NFKC", al)
     bl = u"".join([map_table_b3(ch) for ch in b])
     c = unicodedata.normalize("NFKC", bl)
     if b != c:
         return c
     else:
         return al
 """

 # C.1.1 is a table with a single character
 name, table = tables[0]
 del tables[0]
 assert name == "C.1.1"
 assert table == {0x20:0x20}

 print """
 def in_table_c11(code):
     return code == u" "
 """

 # C.1.2 is the rest of all space characters
 name, table = tables[0]
 del tables[0]
 assert name == "C.1.2"

 # table = set(table.keys())
 # Zs = set(gen_category(["Zs"])) - set([0x20])
 # assert Zs == table

 print """
 def in_table_c12(code):
     return unicodedata.category(code) == "Zs" and code != u" "

 def in_table_c11_c12(code):
     return unicodedata.category(code) == "Zs"
 """

 # C.2.1 ASCII control characters
 name, table_c21 = tables[0]
 del tables[0]
 assert name == "C.2.1"

 Cc = set(gen_category(["Cc"]))
 Cc_ascii = Cc & set(range(128))
 table_c21 = set(table_c21.keys())
 assert Cc_ascii == table_c21

 print """
 def in_table_c21(code):
     return ord(code) < 128 and unicodedata.category(code) == "Cc"
 """

 # C.2.2 Non-ASCII control characters. It also includes
 # a number of characters in category Cf.
 name, table_c22 = tables[0]
 del tables[0]
 assert name == "C.2.2"

 Cc_nonascii = Cc - Cc_ascii
 table_c22 = set(table_c22.keys())
 assert len(Cc_nonascii - table_c22) == 0

 specials = list(table_c22 - Cc_nonascii)
 specials.sort()

 print """c22_specials = """ + compact_set(specials) + """
 def in_table_c22(code):
     c = ord(code)
     if c < 128: return False
     if unicodedata.category(code) == "Cc": return True
     return c in c22_specials

 def in_table_c21_c22(code):
     return unicodedata.category(code) == "Cc" or \\
            ord(code) in c22_specials
 """

 # C.3 Private use
 name, table = tables[0]
 del tables[0]
 assert name == "C.3"

 Co = set(gen_category(["Co"]))
 assert set(table.keys()) == Co

 print """
 def in_table_c3(code):
     return unicodedata.category(code) == "Co"
 """

 # C.4 Non-character code points, xFFFE, xFFFF
 # plus process internal codes
 name, table = tables[0]
 del tables[0]
 assert name == "C.4"

 nonchar = set(range(0xFDD0,0xFDF0) +
               range(0xFFFE,0x110000,0x10000) +
               range(0xFFFF,0x110000,0x10000))
 table = set(table.keys())
 assert table == nonchar

 print """
 def in_table_c4(code):
     c = ord(code)
     if c < 0xFDD0: return False
     if c < 0xFDF0: return True
     return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
 """

 # C.5 Surrogate codes
 name, table = tables[0]
 del tables[0]
 assert name == "C.5"

 Cs = set(gen_category(["Cs"]))
 assert set(table.keys()) == Cs

 print """
 def in_table_c5(code):
     return unicodedata.category(code) == "Cs"
 """

 # C.6 Inappropriate for plain text
 name, table = tables[0]
 del tables[0]
 assert name == "C.6"

 table = table.keys()
 table.sort()

 print """
 c6_set = """ + compact_set(table) + """
 def in_table_c6(code):
     return ord(code) in c6_set
 """

 # C.7 Inappropriate for canonical representation
 name, table = tables[0]
 del tables[0]
 assert name == "C.7"

 table = table.keys()
 table.sort()

 print """
 c7_set = """ + compact_set(table) + """
 def in_table_c7(code):
     return ord(code) in c7_set
 """

 # C.8 Change display properties or are deprecated
 name, table = tables[0]
 del tables[0]
 assert name == "C.8"

 table = table.keys()
 table.sort()

 print """
 c8_set = """ + compact_set(table) + """
 def in_table_c8(code):
     return ord(code) in c8_set
 """

 # C.9 Tagging characters
 name, table = tables[0]
 del tables[0]
 assert name == "C.9"

 table = table.keys()
 table.sort()

 print """
 c9_set = """ + compact_set(table) + """
 def in_table_c9(code):
     return ord(code) in c9_set
 """

 # D.1 Characters with bidirectional property "R" or "AL"
 name, table = tables[0]
 del tables[0]
 assert name == "D.1"

 RandAL = set(gen_bidirectional(["R","AL"]))
 assert set(table.keys()) == RandAL

 print """
 def in_table_d1(code):
     return unicodedata.bidirectional(code) in ("R","AL")
 """

 # D.2 Characters with bidirectional property "L"
 name, table = tables[0]
 del tables[0]
 assert name == "D.2"

 L = set(gen_bidirectional(["L"]))
 assert set(table.keys()) == L

 print """
 def in_table_d2(code):
     return unicodedata.bidirectional(code) == "L"
 """
	import re, unicodedata, sys

	# This script converts every code point up to 0x10FFFF with unichr(), so
	# it refuses to run on a narrow (UCS-2) build.
	if sys.maxunicode == 65535:
	    raise RuntimeError("need UCS-4 Python")

	def gen_category(cats):
	    """Yield, in ascending order, each code point whose general category is in *cats*.

	    The whole range 0..0x10FFFF is scanned; *cats* is any container that
	    supports ``in`` (for example a list of category names such as "Cn").
	    """
	    for i in range(0, 0x110000):
	        if unicodedata.category(unichr(i)) in cats:
	            yield i

	def gen_bidirectional(cats):
	    """Yield, in ascending order, each code point whose bidirectional class is in *cats*.

	    The whole range 0..0x10FFFF is scanned; *cats* is any container that
	    supports ``in`` (for example a list of class names such as "R" or "AL").
	    """
	    for i in range(0, 0x110000):
	        if unicodedata.bidirectional(unichr(i)) in cats:
	            yield i

	def compact_set(l):
	    """Return Python source text for a set containing the integers in *l*.

	    *l* must be an iterable of integers in strictly ascending order.  Runs
	    of consecutive values are written as ``range(a,b)`` terms and the
	    remaining values as a list literal, e.g. ``set([10] + range(1,5))``.
	    An empty input yields ``set()``.

	    NOTE: the emitted expression joins ``range(...)`` terms with ``+``,
	    which is only valid where range() returns a list (Python 2).
	    """
	    singles = []
	    ranges = []
	    start = None
	    span = 0  # number of values following `start` in the current run
	    for e in l:
	        if start is None:
	            start = e
	            span = 0
	            continue
	        if start + span + 1 == e:
	            # e extends the current run of consecutive values.
	            span += 1
	            continue
	        # The run ended: only runs of four or more values become a range.
	        if span > 2:
	            ranges.append((start, start + span + 1))
	        else:
	            singles.extend(range(start, start + span + 1))
	        start = e
	        span = 0
	    if start is None:
	        # Empty input: without this guard the code below would append
	        # None and emit "set([None])".
	        return "set()"
	    # The trailing run becomes a range as soon as it holds two values.
	    if span:
	        ranges.append((start, start + span + 1))
	    else:
	        singles.append(start)
	    range_expr = " + ".join(["range(%d,%d)" % r for r in ranges])
	    if not singles:
	        return "set(%s)" % range_expr
	    if not range_expr:
	        return "set(%s)" % repr(singles)
	    return "set(%s + %s)" % (repr(singles), range_expr)

	############## Read the tables in the RFC #######################

	# Read the whole RFC text and close the file explicitly rather than
	# leaving the handle open.
	rfc_file = open("rfc3454.txt")
	try:
	    data = rfc_file.readlines()
	finally:
	    rfc_file.close()

	# `tables` receives one (name, mapping) pair per table, in file order.
	# `curname` is the name of the table being read, or None between tables.
	tables = []
	curname = None
	for l in data:
	    l = l.strip()
	    if not l:
	        continue
	    # Skip RFC page breaks
	    if l.startswith("Hoffman & Blanchet") or l.startswith("RFC 3454"):
	        continue
	    # Find start/end lines.  "|" must be an unescaped alternation and the
	    # dot in the table name is escaped so it only matches a literal ".".
	    m = re.match(r"----- (Start|End) Table ([A-Z](\.[0-9])+) -----", l)
	    if m:
	        if m.group(1) == "Start":
	            if curname:
	                # String exceptions are not raisable on Python 2.6+;
	                # use a real exception class.
	                raise ValueError("Double Start: %r" % ((curname, l),))
	            curname = m.group(2)
	            table = {}
	            tables.append((curname, table))
	        else:
	            if not curname:
	                raise ValueError("End without start: %r" % (l,))
	            curname = None
	        continue
	    if not curname:
	        continue
	    # Now we are in a table
	    fields = l.split(";")
	    if len(fields) > 1:
	        # Drop comment field
	        fields = fields[:-1]
	    if len(fields) == 1:
	        # Set-style entry: a single code point or a "start-end" range,
	        # stored as an identity mapping.
	        fields = fields[0].split("-")
	        if len(fields) > 1:
	            # range
	            try:
	                start, end = fields
	            except ValueError:
	                raise ValueError("Unpacking problem: %r" % (l,))
	        else:
	            start = end = fields[0]
	        start = int(start, 16)
	        end = int(end, 16)
	        for i in range(start, end+1):
	            table[i] = i
	    else:
	        # Mapping-style entry: code point; space-separated replacements.
	        code, value = fields
	        value = value.strip()
	        if value:
	            value = [int(v, 16) for v in value.split(" ")]
	        else:
	            # table B.1
	            value = None
	        table[int(code, 16)] = value

	########### Generate compact Python versions of the tables #############

	# Emit the header of the generated module: DO-NOT-EDIT banner, module
	# docstring and the unicodedata import.
	print """# This file is generated by mkstringprep.py. DO NOT EDIT.
	\"\"\"Library that exposes various tables found in the StringPrep RFC 3454.

	There are two kinds of tables: sets, for which a member test is provided,
	and mappings, for which a mapping function is provided.
	\"\"\"

	import unicodedata
	"""

	# The generated module asserts the Unicode database version used here.
	print "assert unicodedata.unidata_version == %s" % repr(unicodedata.unidata_version)

	# A.1 is the table of unassigned characters
	# XXX Plane 15 PUA is listed as unassigned in Python.
	# NOTE(review): the function bodies inside the emitted string literals of
	# this section are not indented - the indentation appears to have been
	# lost; confirm against a pristine copy before running.
	# Each section takes the first remaining entry of `tables` and removes it.
	name, table = tables[0]
	del tables[0]
	assert name == "A.1"
	table = set(table.keys())
	Cn = set(gen_category(["Cn"]))

	# FDD0..FDEF are process internal codes
	Cn -= set(range(0xFDD0, 0xFDF0))
	# xxFFFE and xxFFFF of every plane are not characters
	Cn -= set(range(0xFFFE, 0x110000, 0x10000))
	Cn -= set(range(0xFFFF, 0x110000, 0x10000))

	# The comparison against the RFC table is left disabled:
	# assert table == Cn

	print """
	def in_table_a1(code):
	if unicodedata.category(code) != 'Cn': return False
	c = ord(code)
	if 0xFDD0 <= c < 0xFDF0: return False
	return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
	"""

	# B.1 cannot easily be derived, so its sorted members are emitted as a set
	name, table = tables[0]
	del tables[0]
	assert name == "B.1"
	table = table.keys()
	table.sort()
	print """
	b1_set = """ + compact_set(table) + """
	def in_table_b1(code):
	return ord(code) in b1_set
	"""

	# B.2 and B.3 are case folding.
	# They take CaseFolding.txt into account, which is
	# not available in the Python database. Since
	# B.2 is derived from B.3, we process B.3 first.
	# B.3 supposedly is CaseFolding-3.2.0.txt.
	# NOTE(review): the loop bodies in this section are not indented - the
	# indentation appears to have been lost; restore it before running.

	name, table_b2 = tables[0]
	del tables[0]
	assert name == "B.2"

	name, table_b3 = tables[0]
	del tables[0]
	assert name == "B.3"

	# B.3 is mostly Python's .lower, except for a number
	# of special cases, e.g. considering canonical forms.
	# NOTE(review): the exceptions below are computed from table_b2, not
	# table_b3 - confirm that this is intended.

	b3_exceptions = {}

	for k,v in table_b2.items():
	if map(ord, unichr(k).lower()) != v:
	b3_exceptions[k] = u"".join(map(unichr,v))

	b3 = b3_exceptions.items()
	b3.sort()

	# Emit the exceptions as a dict literal, four entries per output line.
	print """
	b3_exceptions = {"""
	for i,(k,v) in enumerate(b3):
	print "0x%x:%s," % (k, repr(v)),
	if i % 4 == 3:
	print
	print "}"

	print """
	def map_table_b3(code):
	r = b3_exceptions.get(ord(code))
	if r is not None: return r
	return code.lower()
	"""

	def map_table_b3(code):
	    """Return the B.3 mapping of the single character *code*.

	    An entry in b3_exceptions takes precedence; otherwise the result is
	    code.lower().
	    """
	    r = b3_exceptions.get(ord(code))
	    if r is not None:
	        return r
	    return code.lower()

	# B.2 is case folding for NFKC. This is the same as B.3,
	# except where NormalizeWithKC(Fold(a)) !=
	# NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))

	def map_table_b2(a):
	    """Return the B.2 mapping (case folding for NFKC) of the character *a*.

	    The character is folded with map_table_b3 and NFKC-normalized; that
	    result is folded and normalized a second time.  If the second pass
	    changes the normalized text, the second result is returned, otherwise
	    the plain B.3 folding of *a*.
	    """
	    al = map_table_b3(a)
	    b = unicodedata.normalize("NFKC", al)
	    bl = u"".join([map_table_b3(ch) for ch in b])
	    c = unicodedata.normalize("NFKC", bl)
	    if b != c:
	        return c
	    return al

	# Collect every B.2 entry that the computed folding fails to reproduce.
	# NOTE(review): the loop body below is not indented - the indentation
	# appears to have been lost; restore it before running.
	specials = {}
	for k,v in table_b2.items():
	if map(ord, map_table_b2(unichr(k))) != v:
	specials[k] = v

	# B.3 should not add any additional special cases
	assert specials == {}

	print """
	def map_table_b2(a):
	al = map_table_b3(a)
	b = unicodedata.normalize("NFKC", al)
	bl = u"".join([map_table_b3(ch) for ch in b])
	c = unicodedata.normalize("NFKC", bl)
	if b != c:
	return c
	else:
	return al
	"""

	# C.1.1 is a table with a single character (U+0020, checked below)
	name, table = tables[0]
	del tables[0]
	assert name == "C.1.1"
	assert table == {0x20:0x20}

	print """
	def in_table_c11(code):
	return code == u" "
	"""

	# C.1.2 is the rest of all space characters
	name, table = tables[0]
	del tables[0]
	assert name == "C.1.2"

	# The cross-check against category Zs is left disabled:
	# table = set(table.keys())
	# Zs = set(gen_category(["Zs"])) - set([0x20])
	# assert Zs == table

	print """
	def in_table_c12(code):
	return unicodedata.category(code) == "Zs" and code != u" "

	def in_table_c11_c12(code):
	return unicodedata.category(code) == "Zs"
	"""

	# C.2.1 ASCII control characters
	name, table_c21 = tables[0]
	del tables[0]
	assert name == "C.2.1"

	# The table must equal the Cc code points below 128.
	Cc = set(gen_category(["Cc"]))
	Cc_ascii = Cc & set(range(128))
	table_c21 = set(table_c21.keys())
	assert Cc_ascii == table_c21

	print """
	def in_table_c21(code):
	return ord(code) < 128 and unicodedata.category(code) == "Cc"
	"""

	# C.2.2 Non-ASCII control characters. It also includes
	# a number of characters in category Cf.
	name, table_c22 = tables[0]
	del tables[0]
	assert name == "C.2.2"

	# Every non-ASCII Cc code point must be listed in the table.
	Cc_nonascii = Cc - Cc_ascii
	table_c22 = set(table_c22.keys())
	assert len(Cc_nonascii - table_c22) == 0

	# Table members that are not in Cc are emitted as an explicit set.
	specials = list(table_c22 - Cc_nonascii)
	specials.sort()

	print """c22_specials = """ + compact_set(specials) + """
	def in_table_c22(code):
	c = ord(code)
	if c < 128: return False
	if unicodedata.category(code) == "Cc": return True
	return c in c22_specials

	def in_table_c21_c22(code):
	return unicodedata.category(code) == "Cc" or \\
	ord(code) in c22_specials
	"""

	# C.3 Private use; the table must equal category Co
	name, table = tables[0]
	del tables[0]
	assert name == "C.3"

	Co = set(gen_category(["Co"]))
	assert set(table.keys()) == Co

	print """
	def in_table_c3(code):
	return unicodedata.category(code) == "Co"
	"""

	# C.4 Non-character code points, xFFFE, xFFFF
	# plus process internal codes
	name, table = tables[0]
	del tables[0]
	assert name == "C.4"

	# FDD0..FDEF plus xxFFFE and xxFFFF of every plane.
	nonchar = set(range(0xFDD0,0xFDF0) +
	range(0xFFFE,0x110000,0x10000) +
	range(0xFFFF,0x110000,0x10000))
	table = set(table.keys())
	assert table == nonchar

	print """
	def in_table_c4(code):
	c = ord(code)
	if c < 0xFDD0: return False
	if c < 0xFDF0: return True
	return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
	"""

	# C.5 Surrogate codes; the table must equal category Cs
	name, table = tables[0]
	del tables[0]
	assert name == "C.5"

	Cs = set(gen_category(["Cs"]))
	assert set(table.keys()) == Cs

	print """
	def in_table_c5(code):
	return unicodedata.category(code) == "Cs"
	"""

	# C.6 Inappropriate for plain text
	# (C.6 to C.9 are each emitted as an explicit set of their sorted members)
	name, table = tables[0]
	del tables[0]
	assert name == "C.6"

	table = table.keys()
	table.sort()

	print """
	c6_set = """ + compact_set(table) + """
	def in_table_c6(code):
	return ord(code) in c6_set
	"""

	# C.7 Inappropriate for canonical representation
	name, table = tables[0]
	del tables[0]
	assert name == "C.7"

	table = table.keys()
	table.sort()

	print """
	c7_set = """ + compact_set(table) + """
	def in_table_c7(code):
	return ord(code) in c7_set
	"""

	# C.8 Change display properties or are deprecated
	name, table = tables[0]
	del tables[0]
	assert name == "C.8"

	table = table.keys()
	table.sort()

	print """
	c8_set = """ + compact_set(table) + """
	def in_table_c8(code):
	return ord(code) in c8_set
	"""

	# C.9 Tagging characters
	name, table = tables[0]
	del tables[0]
	assert name == "C.9"

	table = table.keys()
	table.sort()

	print """
	c9_set = """ + compact_set(table) + """
	def in_table_c9(code):
	return ord(code) in c9_set
	"""

	# D.1 Characters with bidirectional property "R" or "AL"
	name, table = tables[0]
	del tables[0]
	assert name == "D.1"

	# The table must equal the set computed from unicodedata.
	RandAL = set(gen_bidirectional(["R","AL"]))
	assert set(table.keys()) == RandAL

	print """
	def in_table_d1(code):
	return unicodedata.bidirectional(code) in ("R","AL")
	"""

	# D.2 Characters with bidirectional property "L"
	name, table = tables[0]
	del tables[0]
	assert name == "D.2"

	# The table must equal the set computed from unicodedata.
	L = set(gen_bidirectional(["L"]))
	assert set(table.keys()) == L

	print """
	def in_table_d2(code):
	return unicodedata.bidirectional(code) == "L"
	"""