import re
import sys
from unicodedata import ucd_3_2_0 as unicodedata

# The tables cover code points up to 0x10FFFF, so a narrow (UCS-2)
# interpreter build cannot run this script.
if sys.maxunicode == 65535:
    raise RuntimeError("need UCS-4 Python")

def gen_category(cats):
    """Yield each code point whose general category is in *cats*.

    Scans range(0, 0x110000) in ascending order and looks every code
    point up in the module-level ``unicodedata`` database.

    cats -- container of category names (e.g. ["Cn"]); only ``in`` is
            used on it.
    """
    for i in range(0, 0x110000):
        if unicodedata.category(chr(i)) in cats:
            yield i

def gen_bidirectional(cats):
    """Yield each code point whose bidirectional class is in *cats*.

    Scans range(0, 0x110000) in ascending order and looks every code
    point up in the module-level ``unicodedata`` database.

    cats -- container of bidirectional class names (e.g. ["R", "AL"]);
            only ``in`` is used on it.
    """
    for i in range(0, 0x110000):
        if unicodedata.bidirectional(chr(i)) in cats:
            yield i

def compact_set(l):
    """Return Python source text of a ``set(...)`` holding the ints in *l*.

    Runs of consecutive ascending values are collapsed into ``range``
    expressions; everything else is listed literally.  A run that is
    interrupted by a later value is only collapsed when it has more than
    three members, while the final run is collapsed as soon as it has two.
    This asymmetry is kept on purpose so the generated text stays
    byte-identical to what earlier versions of this script produced.

    l -- iterable of ints; callers pass it sorted so that runs are found.

    Returns the string ``"set()"`` for an empty iterable.
    """
    singles = []
    ranges = []
    start = None    # first value of the run currently being collected
    span = 0        # how many values follow `start` in that run
    for e in l:
        if start is None:
            start = e
            span = 0
            continue
        if start + span + 1 == e:
            # e extends the current run
            span += 1
            continue
        # e breaks the run: flush it, then start a new one at e
        if span > 2:
            ranges.append((start, start + span + 1))
        else:
            singles.extend(range(start, start + span + 1))
        start = e
        span = 0

    if start is None:
        # Empty input: there is no pending run to flush.
        return "set()"

    # Flush the final run.
    if span:
        ranges.append((start, start + span + 1))
    else:
        singles.append(start)

    if not singles and len(ranges) == 1:
        # A lone range can be handed to set() directly.
        ranges_src = "range(%d,%d)" % ranges[0]
    else:
        ranges_src = " + ".join("list(range(%d,%d))" % r for r in ranges)

    if not singles:
        return "set(%s)" % ranges_src
    if not ranges_src:
        return "set(%r)" % (singles,)
    return "set(%r + %s)" % (singles, ranges_src)

############## Read the tables in the RFC #######################

with open("rfc3454.txt") as f:
    data = f.readlines()

# Table delimiter lines look like "----- Start Table C.1.2 -----".
# The dot is escaped so that only a literal "." separates the parts of
# the table name.
_table_marker = re.compile(
    r"----- (Start|End) Table ([A-Z](\.[0-9])+) -----")

# `tables` becomes a list of (name, dict) pairs in the order the tables
# appear in the RFC; the code below consumes them in that order.
tables = []
curname = None
for line in data:
    line = line.strip()
    if not line:
        continue
    # Skip RFC page breaks
    if line.startswith(("Hoffman & Blanchet", "RFC 3454")):
        continue
    # Find start/end lines
    m = _table_marker.match(line)
    if m:
        if m.group(1) == "Start":
            if curname:
                raise RuntimeError("Double Start", (curname, line))
            curname = m.group(2)
            table = {}
            tables.append((curname, table))
        else:
            if not curname:
                raise RuntimeError("End without start", line)
            if curname != m.group(2):
                raise RuntimeError("Unexpected end", line)
            curname = None
        continue
    if not curname:
        continue
    # Now we are in a table
    fields = line.split(";")
    if len(fields) > 1:
        # Drop comment field
        fields = fields[:-1]
    if len(fields) == 1:
        # Set-style entry: a single code point or a "start-end" range,
        # stored as identity entries.
        fields = fields[0].split("-")
        if len(fields) > 1:
            # range
            try:
                start, end = fields
            except ValueError:
                raise RuntimeError("Unpacking problem", line)
        else:
            start = end = fields[0]
        start = int(start, 16)
        end = int(end, 16)
        for i in range(start, end+1):
            table[i] = i
    else:
        # Mapping-style entry: "code; value value ...".
        code, value = fields
        value = value.strip()
        if value:
            value = [int(v, 16) for v in value.split(" ")]
        else:
            # table B.1
            value = None
        table[int(code, 16)] = value

if curname:
    # The file ended while a table was still open.
    raise RuntimeError("Missing end", curname)

########### Generate compact Python versions of the tables #############

# Everything printed from here on forms the source of the generated module.
print("""# This file is generated by mkstringprep.py. DO NOT EDIT.
\"\"\"Library that exposes various tables found in the StringPrep RFC 3454.

There are two kinds of tables: sets, for which a member test is provided,
and mappings, for which a mapping function is provided.
\"\"\"

from unicodedata import ucd_3_2_0 as unicodedata
""")

# Make the generated module verify at import time that it runs against
# the same database version this script used.
print("assert unicodedata.unidata_version == %r" % (unicodedata.unidata_version,))

# A.1 is the table of unassigned characters
# XXX Plane 15 PUA is listed as unassigned in Python.
name, table = tables.pop(0)
assert name == "A.1"
table = set(table)
Cn = set(gen_category(["Cn"]))

# FDD0..FDEF are process internal codes
Cn -= set(range(0xFDD0, 0xFDF0))
# not a character
Cn -= set(range(0xFFFE, 0x110000, 0x10000))
Cn -= set(range(0xFFFF, 0x110000, 0x10000))

# The comparison below is disabled (see the XXX note above), so `table`
# and `Cn` are computed but not checked against each other.
# assert table == Cn

print("""
def in_table_a1(code):
    if unicodedata.category(code) != 'Cn': return False
    c = ord(code)
    if 0xFDD0 <= c < 0xFDF0: return False
    return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
""")

# B.1 cannot easily be derived
name, table = tables.pop(0)
assert name == "B.1"
# Only the keys matter here; they are emitted as a literal set.
table = sorted(table)
print("""
b1_set = """ + compact_set(table) + """
def in_table_b1(code):
    return ord(code) in b1_set
""")

# B.2 and B.3 is case folding.
# It takes CaseFolding.txt into account, which is
# not available in the Python database. Since
# B.2 is derived from B.3, we process B.3 first.
# B.3 supposedly *is* CaseFolding-3.2.0.txt.

name, table_b2 = tables.pop(0)
assert name == "B.2"

name, table_b3 = tables.pop(0)
assert name == "B.3"

# B.3 is mostly Python's .lower, except for a number
# of special cases, e.g. considering canonical forms.

b3_exceptions = {}

# NOTE(review): the exceptions are collected from table_b2 even though the
# comments above describe B.3; table_b3 is not read again in this section.
# Kept as is because changing it would alter the generated module --
# confirm this is intended.
for k, v in table_b2.items():
    if list(map(ord, chr(k).lower())) != v:
        b3_exceptions[k] = "".join(map(chr, v))

b3 = sorted(b3_exceptions.items())

# Emit the exception dict, four entries per output line.
print("""
b3_exceptions = {""")
for i, kv in enumerate(b3):
    print("0x%x:%a," % kv, end=' ')
    if i % 4 == 3:
        print()
print("}")

print("""
def map_table_b3(code):
    r = b3_exceptions.get(ord(code))
    if r is not None: return r
    return code.lower()
""")

def map_table_b3(code):
    """Return the mapping of the one-character string *code*.

    Uses the entry in the module-level ``b3_exceptions`` dict (keyed by
    code point) when there is one, and ``code.lower()`` otherwise.  This
    is the same function that the script prints into the generated
    module; it is defined here too so the script can call it.
    """
    r = b3_exceptions.get(ord(code))
    if r is not None:
        return r
    return code.lower()

# B.2 is case folding for NFKC. This is the same as B.3,
# except where NormalizeWithKC(Fold(a)) !=
# NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))

def map_table_b2(a):
    """Return the NFKC-aware mapping of the one-character string *a*.

    Maps *a* with map_table_b3, normalizes to NFKC, then maps and
    normalizes once more.  If the second round changes the result, that
    second-round result is returned; otherwise the plain map_table_b3
    mapping of *a* is returned.
    """
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = "".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    return al

# Collect every B.2 entry that map_table_b2 does not reproduce.
specials = {}
for k, v in table_b2.items():
    if list(map(ord, map_table_b2(chr(k)))) != v:
        specials[k] = v

# B.3 should not add any additional special cases
assert specials == {}

print("""
def map_table_b2(a):
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = "".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    else:
        return al
""")

# C.1.1 is a table with a single character
name, table = tables.pop(0)
assert name == "C.1.1"
assert table == {0x20: 0x20}

print("""
def in_table_c11(code):
    return code == " "
""")

# C.1.2 is the rest of all space characters
name, table = tables.pop(0)
assert name == "C.1.2"

# The cross-check against category Zs is disabled:
# table = set(table.keys())
# Zs = set(gen_category(["Zs"])) - {0x20}
# assert Zs == table

print("""
def in_table_c12(code):
    return unicodedata.category(code) == "Zs" and code != " "

def in_table_c11_c12(code):
    return unicodedata.category(code) == "Zs"
""")

# C.2.1 ASCII control characters
name, table_c21 = tables.pop(0)
assert name == "C.2.1"

# Cc and Cc_ascii are reused by the C.2.2 section that follows.
Cc = set(gen_category(["Cc"]))
Cc_ascii = Cc & set(range(128))
table_c21 = set(table_c21)
assert Cc_ascii == table_c21

print("""
def in_table_c21(code):
    return ord(code) < 128 and unicodedata.category(code) == "Cc"
""")

# C.2.2 Non-ASCII control characters. It also includes
# a number of characters in category Cf.
name, table_c22 = tables.pop(0)
assert name == "C.2.2"

Cc_nonascii = Cc - Cc_ascii
table_c22 = set(table_c22)
# Every non-ASCII Cc character must be listed in the table.
assert len(Cc_nonascii - table_c22) == 0

# Table members that are not in category Cc are emitted as a literal set.
specials = sorted(table_c22 - Cc_nonascii)

print("""c22_specials = """ + compact_set(specials) + """
def in_table_c22(code):
    c = ord(code)
    if c < 128: return False
    if unicodedata.category(code) == "Cc": return True
    return c in c22_specials

def in_table_c21_c22(code):
    return unicodedata.category(code) == "Cc" or \\
           ord(code) in c22_specials
""")

# C.3 Private use
name, table = tables.pop(0)
assert name == "C.3"

Co = set(gen_category(["Co"]))
assert set(table) == Co

print("""
def in_table_c3(code):
    return unicodedata.category(code) == "Co"
""")

# C.4 Non-character code points, xFFFE, xFFFF
# plus process internal codes
name, table = tables.pop(0)
assert name == "C.4"

# FDD0..FDEF plus the last two code points of every plane.
nonchar = set(range(0xFDD0, 0xFDF0))
nonchar.update(range(0xFFFE, 0x110000, 0x10000))
nonchar.update(range(0xFFFF, 0x110000, 0x10000))
table = set(table)
assert table == nonchar

print("""
def in_table_c4(code):
    c = ord(code)
    if c < 0xFDD0: return False
    if c < 0xFDF0: return True
    return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
""")

# C.5 Surrogate codes
name, table = tables.pop(0)
assert name == "C.5"

Cs = set(gen_category(["Cs"]))
assert set(table) == Cs

print("""
def in_table_c5(code):
    return unicodedata.category(code) == "Cs"
""")

# C.6 Inappropriate for plain text
name, table = tables.pop(0)
assert name == "C.6"

# Emitted verbatim as a literal set.
table = sorted(table)

print("""
c6_set = """ + compact_set(table) + """
def in_table_c6(code):
    return ord(code) in c6_set
""")

# C.7 Inappropriate for canonical representation
name, table = tables.pop(0)
assert name == "C.7"

# Emitted verbatim as a literal set.
table = sorted(table)

print("""
c7_set = """ + compact_set(table) + """
def in_table_c7(code):
    return ord(code) in c7_set
""")

# C.8 Change display properties or are deprecated
name, table = tables.pop(0)
assert name == "C.8"

# Emitted verbatim as a literal set.
table = sorted(table)

print("""
c8_set = """ + compact_set(table) + """
def in_table_c8(code):
    return ord(code) in c8_set
""")

# C.9 Tagging characters
name, table = tables.pop(0)
assert name == "C.9"

# Emitted verbatim as a literal set.
table = sorted(table)

print("""
c9_set = """ + compact_set(table) + """
def in_table_c9(code):
    return ord(code) in c9_set
""")

# D.1 Characters with bidirectional property "R" or "AL"
name, table = tables.pop(0)
assert name == "D.1"

RandAL = set(gen_bidirectional(["R", "AL"]))
assert set(table) == RandAL

print("""
def in_table_d1(code):
    return unicodedata.bidirectional(code) in ("R","AL")
""")

# D.2 Characters with bidirectional property "L"
name, table = tables.pop(0)
assert name == "D.2"

L = set(gen_bidirectional(["L"]))
assert set(table) == L

print("""
def in_table_d2(code):
    return unicodedata.bidirectional(code) == "L"
""")