blob: ead020c3a7a4bbc75ffc96f3f23fe1d798d38d08 [file] [log] [blame]
Serhiy Storchakae7275ff2013-06-09 17:08:00 +03001import re, sys
2from unicodedata import ucd_3_2_0 as unicodedata
Martin v. Löwis2548c732003-04-18 10:39:54 +00003
# The generators below walk code points up to 0x10FFFF and convert them with
# chr(), so an interpreter limited to U+FFFF cannot run this script.
if sys.maxunicode == 0xFFFF:
    raise RuntimeError("need UCS-4 Python")
Martin v. Löwis2548c732003-04-18 10:39:54 +00006
def gen_category(cats):
    """Yield, in ascending order, each code point whose category is in *cats*.

    Every integer in ``range(0x110000)`` is turned into a character with
    ``chr()`` and looked up with ``unicodedata.category()`` (the module-level
    ``unicodedata`` binding); the integer is yielded when the result is a
    member of *cats*.
    """
    yield from (
        codepoint
        for codepoint in range(0x110000)
        if unicodedata.category(chr(codepoint)) in cats
    )
11
def gen_bidirectional(cats):
    """Yield, in ascending order, each code point whose bidi class is in *cats*.

    Every integer in ``range(0x110000)`` is turned into a character with
    ``chr()`` and looked up with ``unicodedata.bidirectional()`` (the
    module-level ``unicodedata`` binding); the integer is yielded when the
    result is a member of *cats*.
    """
    yield from (
        codepoint
        for codepoint in range(0x110000)
        if unicodedata.bidirectional(chr(codepoint)) in cats
    )
16
def compact_set(l):
    """Return Python source text for a set holding the integers in *l*.

    *l* is an iterable of integers in strictly ascending order.  Runs of
    consecutive values are written as ``range(start,stop)`` expressions and
    the remaining values as a list literal, for example::

        compact_set([1, 5, 6, 7, 8])  ->  "set([1] + list(range(5,9)))"

    An empty *l* yields ``"set()"`` (previously it produced the bogus
    ``"set([None])"``).
    """
    singles = []
    spans = []      # half-open (start, stop) pairs
    start = None    # first value of the run currently being collected
    extent = 0      # how many consecutive values follow *start* so far

    for value in l:
        if start is None:
            start = value
            extent = 0
        elif start + extent + 1 == value:
            extent += 1
        else:
            # The run is broken.  Only runs of four or more values are worth
            # a range(); shorter ones are listed individually.
            if extent > 2:
                spans.append((start, start + extent + 1))
            else:
                singles.extend(range(start, start + extent + 1))
            start = value
            extent = 0

    if start is None:
        # Nothing was iterated: there is no pending run to flush.
        return "set()"

    # The final run becomes a range as soon as it has two members (interior
    # runs need four).  Kept as-is so the generated output does not change.
    if extent:
        spans.append((start, start + extent + 1))
    else:
        singles.append(start)

    if not singles and len(spans) == 1:
        # A lone range can be passed to set() directly.
        ranges = "range(%d,%d)" % spans[0]
    else:
        ranges = " + ".join("list(range(%d,%d))" % span for span in spans)

    if not singles:
        return "set(%s)" % ranges
    if not ranges:
        return "set(%r)" % (singles,)
    return "set(%r + %s)" % (singles, ranges)
Martin v. Löwis2548c732003-04-18 10:39:54 +000050
############## Read the tables in the RFC #######################

# Table delimiters look like "----- Start Table C.1.1 -----".  The dot is
# escaped so that only a literal "." can separate the parts of a table name,
# and the pattern is compiled once instead of on every input line.
_TABLE_MARKER = re.compile(r"----- (Start|End) Table ([A-Z](\.[0-9])+) -----")

with open("rfc3454.txt") as f:
    data = f.readlines()

# (name, mapping) pairs, in the order the tables appear in the file.
tables = []
# Name of the table currently being read; None while between tables.
curname = None
for l in data:
    l = l.strip()
    if not l:
        continue
    # Skip RFC page breaks
    if l.startswith(("Hoffman & Blanchet", "RFC 3454")):
        continue
    # Find start/end lines
    m = _TABLE_MARKER.match(l)
    if m:
        if m.group(1) == "Start":
            if curname:
                raise RuntimeError("Double Start", (curname, l))
            curname = m.group(2)
            table = {}
            tables.append((curname, table))
            continue
        else:
            if not curname:
                raise RuntimeError("End without start", l)
            if curname != m.group(2):
                raise RuntimeError("Unexpected end", l)
            curname = None
            continue
    if not curname:
        continue
    # Now we are in a table
    fields = l.split(";")
    if len(fields) > 1:
        # Drop comment field
        fields = fields[:-1]
    if len(fields) == 1:
        # Set-style entry: a single code point or a "start-end" range.
        # Every member is stored mapped to itself.
        fields = fields[0].split("-")
        if len(fields) > 1:
            # range
            try:
                start, end = fields
            except ValueError:
                raise RuntimeError("Unpacking problem", l)
        else:
            start = end = fields[0]
        start = int(start, 16)
        end = int(end, 16)
        for i in range(start, end+1):
            table[i] = i
    else:
        # Mapping-style entry: "code; space-separated replacement code points".
        code, value = fields
        value = value.strip()
        if value:
            value = [int(v, 16) for v in value.split(" ")]
        else:
            # table B.1
            value = None
        table[int(code, 16)] = value
112
########### Generate compact Python versions of the tables #############

# Header of the generated module: a "do not edit" notice, its docstring and
# the import of the Unicode 3.2.0 database.
print('''# This file is generated by mkstringprep.py. DO NOT EDIT.
"""Library that exposes various tables found in the StringPrep RFC 3454.

There are two kinds of tables: sets, for which a member test is provided,
and mappings, for which a mapping function is provided.
"""

from unicodedata import ucd_3_2_0 as unicodedata
''')

# Emit an assert that pins the generated module to the database version
# this script was run against.
print("assert unicodedata.unidata_version == %r"
      % (unicodedata.unidata_version,))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000126
# A.1 is the table of unassigned characters
# XXX Plane 15 PUA is listed as unassigned in Python.
name, table = tables.pop(0)
assert name == "A.1"
table = set(table)
Cn = set(gen_category(["Cn"]))

# FDD0..FDEF are process internal codes
Cn.difference_update(range(0xFDD0, 0xFDF0))
# The last two code points of every plane (xxFFFE, xxFFFF) are not characters
Cn.difference_update(range(0xFFFE, 0x110000, 0x10000))
Cn.difference_update(range(0xFFFF, 0x110000, 0x10000))

# Cross-check left disabled in the original (presumably because of the
# Plane 15 discrepancy noted above -- TODO confirm):
# assert table == Cn

print("""
def in_table_a1(code):
    if unicodedata.category(code) != 'Cn': return False
    c = ord(code)
    if 0xFDD0 <= c < 0xFDF0: return False
    return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
""")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000150
# B.1 cannot easily be derived, so its members are written out as a
# literal set in the generated module.
name, table = tables.pop(0)
assert name == "B.1"
table = sorted(table)
print("""
b1_set = """ + compact_set(table) + """
def in_table_b1(code):
    return ord(code) in b1_set
""")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000161
# B.2 and B.3 are case folding.
# They take CaseFolding.txt into account, which is
# not available in the Python database. Since
# B.2 is derived from B.3, we process B.3 first.
# B.3 supposedly *is* CaseFolding-3.2.0.txt.

name, table_b2 = tables.pop(0)
assert name == "B.2"

name, table_b3 = tables.pop(0)
assert name == "B.3"

# B.3 is mostly Python's .lower, except for a number
# of special cases, e.g. considering canonical forms.
# Only the entries where str.lower() disagrees with the table are recorded.
# NOTE(review): the entries are taken from table_b2 although the comments
# here talk about B.3, and table_b3 is not read again in this section --
# confirm that this is intended before changing it.
b3_exceptions = {
    k: "".join(map(chr, v))
    for k, v in table_b2.items()
    if [ord(ch) for ch in chr(k).lower()] != v
}

b3 = sorted(b3_exceptions.items())

# Write the exceptions as a dict literal, four entries per line.
print("""
b3_exceptions = {""")
for position, entry in enumerate(b3, 1):
    print("0x%x:%a," % entry, end=' ')
    if position % 4 == 0:
        print()
print("}")

print("""
def map_table_b3(code):
    r = b3_exceptions.get(ord(code))
    if r is not None: return r
    return code.lower()
""")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000201
def map_table_b3(code):
    """Return the mapping of the single character *code*.

    The module-level ``b3_exceptions`` dict (keyed by code point) takes
    precedence; characters without an entry map to ``code.lower()``.
    """
    override = b3_exceptions.get(ord(code))
    return code.lower() if override is None else override
206
# B.2 is case folding for NFKC. This is the same as B.3,
# except where NormalizeWithKC(Fold(a)) !=
# NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))

def map_table_b2(a):
    """Return the mapping of the character *a* for use with NFKC.

    *a* is folded with map_table_b3() and NFKC-normalized; that result is
    folded character by character and normalized once more.  If the second
    pass changed anything, its result is returned; otherwise the plain
    map_table_b3() folding of *a* is returned.
    """
    folded = map_table_b3(a)
    normalized = unicodedata.normalize("NFKC", folded)
    refolded = "".join(map(map_table_b3, normalized))
    renormalized = unicodedata.normalize("NFKC", refolded)
    return renormalized if normalized != renormalized else folded
220
# Collect every B.2 entry that map_table_b2() fails to reproduce.
specials = {
    k: v
    for k, v in table_b2.items()
    if [ord(ch) for ch in map_table_b2(chr(k))] != v
}

# B.3 should not add any additional special cases
assert specials == {}

print("""
def map_table_b2(a):
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = "".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    else:
        return al
""")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000240
# C.1.1 is a table with a single character (U+0020)
name, table = tables.pop(0)
assert name == "C.1.1"
assert table == {0x20: 0x20}

print("""
def in_table_c11(code):
    return code == " "
""")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000251
# C.1.2 is the rest of all space characters
name, table = tables.pop(0)
assert name == "C.1.2"

# Cross-check against category Zs, left disabled in the original:
# table = set(table.keys())
# Zs = set(gen_category(["Zs"])) - {0x20}
# assert Zs == table

print("""
def in_table_c12(code):
    return unicodedata.category(code) == "Zs" and code != " "

def in_table_c11_c12(code):
    return unicodedata.category(code) == "Zs"
""")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000268
# C.2.1 ASCII control characters
name, table_c21 = tables.pop(0)
assert name == "C.2.1"

# The table must be exactly the category-Cc code points below 128.
Cc = set(gen_category(["Cc"]))
Cc_ascii = Cc.intersection(range(128))
table_c21 = set(table_c21)
assert Cc_ascii == table_c21

print("""
def in_table_c21(code):
    return ord(code) < 128 and unicodedata.category(code) == "Cc"
""")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000283
# C.2.2 Non-ASCII control characters. It also includes
# a number of characters in category Cf.
name, table_c22 = tables.pop(0)
assert name == "C.2.2"

# Every non-ASCII Cc code point must be listed in the table ...
Cc_nonascii = Cc - Cc_ascii
table_c22 = set(table_c22)
assert not (Cc_nonascii - table_c22)

# ... and whatever the table lists beyond those is emitted explicitly.
specials = sorted(table_c22 - Cc_nonascii)

print("""c22_specials = """ + compact_set(specials) + """
def in_table_c22(code):
    c = ord(code)
    if c < 128: return False
    if unicodedata.category(code) == "Cc": return True
    return c in c22_specials

def in_table_c21_c22(code):
    return unicodedata.category(code) == "Cc" or \\
           ord(code) in c22_specials
""")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000308
# C.3 Private use
name, table = tables.pop(0)
assert name == "C.3"

# The table must coincide with general category Co.
Co = set(gen_category(["Co"]))
assert set(table) == Co

print("""
def in_table_c3(code):
    return unicodedata.category(code) == "Co"
""")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000321
# C.4 Non-character code points, xFFFE, xFFFF
# plus process internal codes
name, table = tables.pop(0)
assert name == "C.4"

# FDD0..FDEF plus the last two code points of every plane.
nonchar = (
    set(range(0xFDD0, 0xFDF0))
    | set(range(0xFFFE, 0x110000, 0x10000))
    | set(range(0xFFFF, 0x110000, 0x10000))
)
table = set(table)
assert table == nonchar

print("""
def in_table_c4(code):
    c = ord(code)
    if c < 0xFDD0: return False
    if c < 0xFDF0: return True
    return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
""")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000341
# C.5 Surrogate codes
name, table = tables.pop(0)
assert name == "C.5"

# The table must coincide with general category Cs.
Cs = set(gen_category(["Cs"]))
assert set(table) == Cs

print("""
def in_table_c5(code):
    return unicodedata.category(code) == "Cs"
""")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000354
# C.6 Inappropriate for plain text
# (emitted as a literal set of the listed code points)
name, table = tables.pop(0)
assert name == "C.6"

table = sorted(table)

print("""
c6_set = """ + compact_set(table) + """
def in_table_c6(code):
    return ord(code) in c6_set
""")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000367
# C.7 Inappropriate for canonical representation
# (emitted as a literal set of the listed code points)
name, table = tables.pop(0)
assert name == "C.7"

table = sorted(table)

print("""
c7_set = """ + compact_set(table) + """
def in_table_c7(code):
    return ord(code) in c7_set
""")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000380
# C.8 Change display properties or are deprecated
# (emitted as a literal set of the listed code points)
name, table = tables.pop(0)
assert name == "C.8"

table = sorted(table)

print("""
c8_set = """ + compact_set(table) + """
def in_table_c8(code):
    return ord(code) in c8_set
""")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000393
# C.9 Tagging characters
# (emitted as a literal set of the listed code points)
name, table = tables.pop(0)
assert name == "C.9"

table = sorted(table)

print("""
c9_set = """ + compact_set(table) + """
def in_table_c9(code):
    return ord(code) in c9_set
""")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000406
# D.1 Characters with bidirectional property "R" or "AL"
name, table = tables.pop(0)
assert name == "D.1"

# The table must coincide with the code points of those two bidi classes.
RandAL = set(gen_bidirectional(["R", "AL"]))
assert set(table) == RandAL

print("""
def in_table_d1(code):
    return unicodedata.bidirectional(code) in ("R","AL")
""")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000419
# D.2 Characters with bidirectional property "L"
name, table = tables.pop(0)
assert name == "D.2"

# The table must coincide with the code points of bidi class L.
L = set(gen_bidirectional(["L"]))
assert set(table) == L

print("""
def in_table_d2(code):
    return unicodedata.bidirectional(code) == "L"
""")
Collin Winter6afaeb72007-08-03 17:06:41 +0000431""")