| import re, sys |
| from unicodedata import ucd_3_2_0 as unicodedata |
| |
# The tables cover code points up to 0x10FFFF, so refuse to run on an
# interpreter whose largest code point is 0xFFFF.
if sys.maxunicode == 0xFFFF:
    raise RuntimeError("need UCS-4 Python")
| |
def gen_category(cats):
    """Yield each code point whose general category is contained in *cats*.

    Code points 0 .. 0x10FFFF are examined in ascending order using the
    module-level ``unicodedata`` object; *cats* is only used for ``in``
    tests, so any container of category strings works.
    """
    for codepoint in range(0x110000):
        if unicodedata.category(chr(codepoint)) not in cats:
            continue
        yield codepoint
| |
def gen_bidirectional(cats):
    """Yield each code point whose bidirectional class is contained in *cats*.

    Code points 0 .. 0x10FFFF are examined in ascending order using the
    module-level ``unicodedata`` object; *cats* is only used for ``in``
    tests, so any container of bidi class strings works.
    """
    for codepoint in range(0x110000):
        if unicodedata.bidirectional(chr(codepoint)) not in cats:
            continue
        yield codepoint
| |
def compact_set(l):
    """Return Python source text for a set containing the integers in *l*.

    *l* is expected to be in strictly ascending order (the callers in this
    file pass ``sorted(...)`` results); runs of consecutive values are
    detected by comparing each element with the previous run.

    The result is one of::

        set()                                   # empty input
        set(range(a,b))                         # exactly one range, no singles
        set([x, y, ...])                        # no ranges
        set([x, y, ...] + list(range(a,b)) + list(range(c,d)) ...)

    An interior run is written as a range only when it holds at least four
    values (``span > 2``); the final run is written as a range as soon as it
    holds two.  That asymmetry is kept as-is so the generated text does not
    change.
    """
    single = []     # isolated values and members of short runs
    ranges = []     # (start, stop) pairs, stop exclusive
    prev = None     # first value of the run currently being collected
    span = 0        # number of values in the current run beyond ``prev``
    for e in l:
        if prev is None:
            prev = e
            span = 0
            continue
        if prev + span + 1 == e:
            # e directly extends the current run
            span += 1
            continue
        # e starts a new run: flush the one collected so far
        if span > 2:
            ranges.append((prev, prev + span + 1))
        else:
            single.extend(range(prev, prev + span + 1))
        prev = e
        span = 0
    # Flush the final run.  With empty input there is no run at all; the
    # guard keeps None from being emitted as a set member.
    if prev is not None:
        if span:
            ranges.append((prev, prev + span + 1))
        else:
            single.append(prev)
    if not single and len(ranges) == 1:
        # A lone range can be handed to set() directly.
        range_expr = "range(%d,%d)" % ranges[0]
    else:
        range_expr = " + ".join("list(range(%d,%d))" % t for t in ranges)
    if not single:
        return "set(%s)" % range_expr
    if not range_expr:
        return "set(%r)" % (single,)
    return "set(%r + %s)" % (single, range_expr)
| |
| ############## Read the tables in the RFC ####################### |
| |
# Table delimiters look like "----- Start Table C.1.1 -----".  The pattern is
# a raw string and the separator dots are escaped so that only a literal "."
# is accepted between the components of a table name.
_TABLE_MARKER = re.compile(r"----- (Start|End) Table ([A-Z](\.[0-9])+) -----")


def _read_tables(lines):
    """Extract the code point tables from the lines of the RFC text.

    Returns a list of ``(name, table)`` pairs in the order the tables appear.
    Each ``table`` is a dict keyed by code point:

    * for a set entry (a single code point or a ``start-end`` range) every
      code point maps to itself;
    * for a mapping entry the value is the list of replacement code points,
      or ``None`` when the replacement field is empty (table B.1).

    Raises RuntimeError on unbalanced Start/End markers or on a set entry
    that does not split into exactly ``start-end``.
    """
    tables = []
    curname = None      # name of the table being read, None outside tables
    table = None
    for line in lines:
        line = line.strip()
        if not line:
            continue
        # Skip RFC page breaks
        if line.startswith(("Hoffman & Blanchet", "RFC 3454")):
            continue
        # Find start/end lines
        m = _TABLE_MARKER.match(line)
        if m:
            marker, marked_name = m.group(1, 2)
            if marker == "Start":
                if curname:
                    raise RuntimeError("Double Start", (curname, line))
                curname = marked_name
                table = {}
                tables.append((curname, table))
            else:
                if not curname:
                    raise RuntimeError("End without start", line)
                if curname != marked_name:
                    raise RuntimeError("Unexpected end", line)
                curname = None
            continue
        if not curname:
            continue
        # Now we are in a table
        fields = line.split(";")
        if len(fields) > 1:
            # Drop comment field
            fields = fields[:-1]
        if len(fields) == 1:
            # Set entry: either one code point or a "start-end" range.
            fields = fields[0].split("-")
            if len(fields) > 1:
                try:
                    start, end = fields
                except ValueError as err:
                    raise RuntimeError("Unpacking problem", line) from err
            else:
                start = end = fields[0]
            start = int(start, 16)
            end = int(end, 16)
            for i in range(start, end + 1):
                table[i] = i
        else:
            # Mapping entry: "code; replacement code points"
            code, value = fields
            value = value.strip()
            if value:
                value = [int(v, 16) for v in value.split(" ")]
            else:
                # table B.1
                value = None
            table[int(code, 16)] = value
    return tables


with open("rfc3454.txt") as f:
    data = f.readlines()

tables = _read_tables(data)
| |
| ########### Generate compact Python versions of the tables ############# |
| |
# Emit the fixed preamble of the generated module: a "do not edit" banner,
# the module docstring and the import of the unicodedata object it relies on.
# The template uses ''' so the generated docstring delimiters need no escaping.
print('''# This file is generated by mkstringprep.py. DO NOT EDIT.
"""Library that exposes various tables found in the StringPrep RFC 3454.

There are two kinds of tables: sets, for which a member test is provided,
and mappings, for which a mapping function is provided.
"""

from unicodedata import ucd_3_2_0 as unicodedata
''')

# Make the generated module verify that it runs against the same database
# version that was used to generate it.
print("assert unicodedata.unidata_version == {!r}".format(unicodedata.unidata_version))
| |
# A.1 is the table of unassigned characters
# XXX Plane 15 PUA is listed as unassigned in Python.
name, table = tables.pop(0)
assert name == "A.1"
table = set(table)
Cn = set(gen_category(["Cn"]))

# FDD0..FDEF are process internal codes
Cn -= set(range(0xFDD0, 0xFDF0))
# not a character: xxFFFE and xxFFFF in every plane
Cn -= set(range(0xFFFE, 0x110000, 0x10000))
Cn -= set(range(0xFFFF, 0x110000, 0x10000))

# The comparison of the RFC table with the derived set is disabled.
# assert table == Cn

# The generated predicate repeats the derivation above.  Indentation inside
# the template is significant: it becomes the body of the generated function.
print("""
def in_table_a1(code):
    if unicodedata.category(code) != 'Cn': return False
    c = ord(code)
    if 0xFDD0 <= c < 0xFDF0: return False
    return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
""")
| |
# B.1 cannot easily be derived, so its code points are written out
# literally (in compacted form) into the generated module.
name, table = tables.pop(0)
assert name == "B.1"
table = sorted(table)
# Indentation inside the template is significant: it becomes the body of
# the generated function.
print("""
b1_set = """ + compact_set(table) + """
def in_table_b1(code):
    return ord(code) in b1_set
""")
| |
# B.2 and B.3 are the case folding tables.
# They take CaseFolding.txt into account, which is
# not available in the Python database. Since
# B.2 is derived from B.3, we process B.3 first.
# B.3 supposedly *is* CaseFolding-3.2.0.txt.

# The tables are consumed in file order: B.2 first, then B.3.
name, table_b2 = tables.pop(0)
assert name == "B.2"

name, table_b3 = tables.pop(0)
assert name == "B.3"
| |
# B.3 is mostly Python's .lower, except for a number
# of special cases, e.g. considering canonical forms.
# NOTE(review): the exceptions below are computed from table_b2; table_b3 is
# not consulted in this section -- confirm that this is intended.

b3_exceptions = {}

for k, v in table_b2.items():
    # v is the list of replacement code points parsed from the RFC; record
    # k only when str.lower() does not already yield exactly that sequence.
    if [ord(ch) for ch in chr(k).lower()] != v:
        b3_exceptions[k] = "".join(map(chr, v))

b3 = sorted(b3_exceptions.items())

# Emit the exceptions as a dict literal, four entries per output line.
print("""
b3_exceptions = {""")
for i, kv in enumerate(b3):
    print("0x%x:%a," % kv, end=' ')
    if i % 4 == 3:
        print()
print("}")

# Indentation inside the template is significant: it becomes the body of
# the generated function.
print("""
def map_table_b3(code):
    r = b3_exceptions.get(ord(code))
    if r is not None: return r
    return code.lower()
""")
| |
def map_table_b3(code):
    """Return the case-folded form of the single character *code*.

    An entry in the module-level ``b3_exceptions`` dict (keyed by code
    point) takes precedence; otherwise ``code.lower()`` is returned.
    """
    override = b3_exceptions.get(ord(code))
    return code.lower() if override is None else override
| |
# B.2 is case folding for NFKC. This is the same as B.3,
# except where NormalizeWithKC(Fold(a)) !=
# NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))

def map_table_b2(a):
    """Return the NFKC-aware case folding of the single character *a*.

    The character is folded with map_table_b3 and NFKC-normalized; the
    result is then folded and normalized a second time.  When the second
    pass changes the normalized string, that second result is returned;
    otherwise the plain (un-normalized) first fold is returned.
    """
    folded = map_table_b3(a)
    normalized = unicodedata.normalize("NFKC", folded)
    refolded = "".join(map(map_table_b3, normalized))
    renormalized = unicodedata.normalize("NFKC", refolded)
    return folded if normalized == renormalized else renormalized
| |
# Collect every B.2 entry that map_table_b2 does not reproduce exactly.
specials = {
    k: v
    for k, v in table_b2.items()
    if [ord(ch) for ch in map_table_b2(chr(k))] != v
}

# B.3 should not add any additional special cases
assert specials == {}

# The generated function is a verbatim copy of map_table_b2 above.
# Indentation inside the template is significant: it becomes the body of
# the generated function.
print("""
def map_table_b2(a):
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = "".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    else:
        return al
""")
| |
# C.1.1 is a table with a single character
name, table = tables.pop(0)
assert name == "C.1.1"
# Set entries map each code point to itself, so the table is exactly {SPACE}.
assert table == {0x20:0x20}

# Indentation inside the template is significant: it becomes the body of
# the generated function.
print("""
def in_table_c11(code):
    return code == " "
""")
| |
# C.1.2 is the rest of all space characters
name, table = tables.pop(0)
assert name == "C.1.2"

# The cross-check against category Zs is disabled:
# table = set(table.keys())
# Zs = set(gen_category(["Zs"])) - {0x20}
# assert Zs == table

# Besides the C.1.2 predicate, a combined C.1.1 + C.1.2 predicate is emitted.
# Indentation inside the template is significant: it becomes the bodies of
# the generated functions.
print("""
def in_table_c12(code):
    return unicodedata.category(code) == "Zs" and code != " "

def in_table_c11_c12(code):
    return unicodedata.category(code) == "Zs"
""")
| |
# C.2.1 ASCII control characters
name, table_c21 = tables.pop(0)
assert name == "C.2.1"

# Cc and Cc_ascii are reused by the C.2.2 section that follows.
Cc = set(gen_category(["Cc"]))
Cc_ascii = Cc & set(range(128))
table_c21 = set(table_c21)
# The RFC table must be exactly the category-Cc code points below 128.
assert Cc_ascii == table_c21

# Indentation inside the template is significant: it becomes the body of
# the generated function.
print("""
def in_table_c21(code):
    return ord(code) < 128 and unicodedata.category(code) == "Cc"
""")
| |
# C.2.2 Non-ASCII control characters. It also includes
# a number of characters in category Cf.
name, table_c22 = tables.pop(0)
assert name == "C.2.2"

Cc_nonascii = Cc - Cc_ascii
table_c22 = set(table_c22)
# Every non-ASCII Cc code point must be listed in the RFC table ...
assert len(Cc_nonascii - table_c22) == 0

# ... and whatever the table lists beyond those is written out literally.
specials = sorted(table_c22 - Cc_nonascii)

# Besides the C.2.2 predicate, a combined C.2.1 + C.2.2 predicate is emitted.
# Indentation inside the template is significant: it becomes the bodies of
# the generated functions (including the continuation line).
print("""c22_specials = """ + compact_set(specials) + """
def in_table_c22(code):
    c = ord(code)
    if c < 128: return False
    if unicodedata.category(code) == "Cc": return True
    return c in c22_specials

def in_table_c21_c22(code):
    return unicodedata.category(code) == "Cc" or \\
           ord(code) in c22_specials
""")
| |
# C.3 Private use
name, table = tables.pop(0)
assert name == "C.3"

# The RFC table must be exactly the code points of category Co.
Co = set(gen_category(["Co"]))
assert set(table) == Co

# Indentation inside the template is significant: it becomes the body of
# the generated function.
print("""
def in_table_c3(code):
    return unicodedata.category(code) == "Co"
""")
| |
# C.4 Non-character code points, xFFFE, xFFFF
# plus process internal codes
name, table = tables.pop(0)
assert name == "C.4"

# Rebuild the expected set: FDD0..FDEF plus xxFFFE and xxFFFF of every plane.
nonchar = set(range(0xFDD0, 0xFDF0))
nonchar.update(range(0xFFFE, 0x110000, 0x10000))
nonchar.update(range(0xFFFF, 0x110000, 0x10000))
table = set(table)
assert table == nonchar

# Indentation inside the template is significant: it becomes the body of
# the generated function.
print("""
def in_table_c4(code):
    c = ord(code)
    if c < 0xFDD0: return False
    if c < 0xFDF0: return True
    return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
""")
| |
# C.5 Surrogate codes
name, table = tables.pop(0)
assert name == "C.5"

# The RFC table must be exactly the code points of category Cs.
Cs = set(gen_category(["Cs"]))
assert set(table) == Cs

# Indentation inside the template is significant: it becomes the body of
# the generated function.
print("""
def in_table_c5(code):
    return unicodedata.category(code) == "Cs"
""")
| |
# C.6 Inappropriate for plain text
name, table = tables.pop(0)
assert name == "C.6"

# No derivation is attempted: the code points are written out literally.
table = sorted(table)

# Indentation inside the template is significant: it becomes the body of
# the generated function.
print("""
c6_set = """ + compact_set(table) + """
def in_table_c6(code):
    return ord(code) in c6_set
""")
| |
# C.7 Inappropriate for canonical representation
name, table = tables.pop(0)
assert name == "C.7"

# No derivation is attempted: the code points are written out literally.
table = sorted(table)

# Indentation inside the template is significant: it becomes the body of
# the generated function.
print("""
c7_set = """ + compact_set(table) + """
def in_table_c7(code):
    return ord(code) in c7_set
""")
| |
# C.8 Change display properties or are deprecated
name, table = tables.pop(0)
assert name == "C.8"

# No derivation is attempted: the code points are written out literally.
table = sorted(table)

# Indentation inside the template is significant: it becomes the body of
# the generated function.
print("""
c8_set = """ + compact_set(table) + """
def in_table_c8(code):
    return ord(code) in c8_set
""")
| |
# C.9 Tagging characters
name, table = tables.pop(0)
assert name == "C.9"

# No derivation is attempted: the code points are written out literally.
table = sorted(table)

# Indentation inside the template is significant: it becomes the body of
# the generated function.
print("""
c9_set = """ + compact_set(table) + """
def in_table_c9(code):
    return ord(code) in c9_set
""")
| |
# D.1 Characters with bidirectional property "R" or "AL"
name, table = tables.pop(0)
assert name == "D.1"

# The RFC table must be exactly the code points with bidi class R or AL.
RandAL = set(gen_bidirectional(["R","AL"]))
assert set(table) == RandAL

# Indentation inside the template is significant: it becomes the body of
# the generated function.
print("""
def in_table_d1(code):
    return unicodedata.bidirectional(code) in ("R","AL")
""")
| |
# D.2 Characters with bidirectional property "L"
name, table = tables.pop(0)
assert name == "D.2"

# The RFC table must be exactly the code points with bidi class L.
L = set(gen_bidirectional(["L"]))
assert set(table) == L

# Indentation inside the template is significant: it becomes the body of
# the generated function.
print("""
def in_table_d2(code):
    return unicodedata.bidirectional(code) == "L"
""")