blob: fde710b395f4d9b97aee290d0b7e3886f149ddba [file] [log] [blame]
# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
2
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003import stringprep, re, codecs
4from unicodedata import db_3_2_0 as unicodedata
Martin v. Löwis2548c732003-04-18 10:39:54 +00005
# IDNA section 3.1: the characters recognized as label separators
# (U+002E, U+3002, U+FF0E and U+FF61).
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix, once as a plain string and once as a
# unicode string decoded from it.
ace_prefix = "xn--"
uace_prefix = unicode(ace_prefix, "ascii")
12
# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep steps (map, normalize, prohibit, bidi) to a label.

    label -- unicode string holding a single label.

    Returns the prepared unicode label.

    Raises UnicodeError if the prepared label contains a character from
    one of the checked prohibition tables, or if it violates one of the
    bidirectional requirements.
    """
    # Map: characters in table B.1 are mapped to nothing, everything
    # else goes through the B.2 mapping.
    label = u"".join([stringprep.map_table_b2(c)
                      for c in label
                      if not stringprep.in_table_b1(c)])

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %s" % repr(c))

    # Check bidi.  A real list is built because it is indexed below.
    RandAL = [stringprep.in_table_d1(c) for c in label]
    # The checks concern the label as a whole, so they run once instead
    # of once per RandAL character.
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
61
def ToASCII(label):
    """Convert a single label to its ASCII (ACE) form.

    label -- the label to convert.

    Returns the encoded label; its length is always between 1 and 63.

    Raises UnicodeError if the label already starts with the ACE prefix
    after nameprep, if the result is empty or longer than 63 characters,
    or if nameprep rejects the label.
    """
    # Step 1: try ASCII
    try:
        encoded = label.encode("ascii")
    except UnicodeError:
        encoded = None

    if encoded is None:
        # Step 2: nameprep
        label = nameprep(label)

        # Step 3: UseSTD3ASCIIRules is false
        # Step 4: try ASCII
        try:
            encoded = label.encode("ascii")
        except UnicodeError:
            encoded = None

    if encoded is None:
        # Step 5: Check ACE prefix
        if label.startswith(uace_prefix):
            raise UnicodeError("Label starts with ACE prefix")

        # Step 6: Encode with PUNYCODE
        # Step 7: Prepend ACE prefix
        encoded = ace_prefix + label.encode("punycode")

    # Step 8: Check size.  Every path ends here; the bound also rejects
    # an empty label, which the message now states.
    if 0 < len(encoded) < 64:
        return encoded
    raise UnicodeError("label empty or too long")
104
def ToUnicode(label):
    """Convert a single label, possibly ACE-encoded, to unicode.

    label -- the label to convert.  A `str` instance is taken to be
    ASCII without being checked here; anything else is encoded to
    ASCII, going through nameprep first if that fails.

    Returns the label decoded as ASCII when it does not start with the
    ACE prefix, otherwise the punycode-decoded label.

    Raises UnicodeError if the label is not ASCII after nameprep, or if
    re-encoding the decoded label with ToASCII does not reproduce the
    lower-cased input.
    """
    # Step 1: Check for ASCII
    needs_nameprep = False
    if not isinstance(label, str):
        try:
            label = label.encode("ascii")
        except UnicodeError:
            needs_nameprep = True

    if needs_nameprep:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix (this comparison is case-sensitive)
    if not label.startswith(ace_prefix):
        return unicode(label, "ascii")

    # Step 4: Remove ACE prefix
    # Step 5: Decode using PUNYCODE
    decoded = label[len(ace_prefix):].decode("punycode")

    # Step 6: Apply ToASCII
    reencoded = ToASCII(decoded)

    # Step 7: Compare the result of step 6 with the one of step 3
    # reencoded will already be in lower case.
    if label.lower() != reencoded:
        raise UnicodeError("IDNA does not round-trip", label, reencoded)

    # Step 8: return the result of step 5
    return decoded
Tim Peters0eadaac2003-04-24 16:02:54 +0000143
### Codec APIs
145
class Codec(codecs.Codec):
    """Stateless codec converting whole domain names label by label."""

    def encode(self, input, errors='strict'):
        """Encode a domain name, applying ToASCII to each label.

        Returns a (encoded name, len(input)) tuple.  Only
        errors='strict' is accepted; anything else raises UnicodeError.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return "", 0

        labels = dots.split(input)
        # An empty last label means the name ended with a separator;
        # remember that and keep the empty label away from ToASCII.
        trailing_dot = ''
        if labels and not labels[-1]:
            trailing_dot = '.'
            labels.pop()

        encoded = [ToASCII(label) for label in labels]
        # Join with U+002E
        return ".".join(encoded)+trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode a domain name, applying ToUnicode to each label.

        Returns a (decoded name, len(input)) tuple.  Only
        errors='strict' is accepted; anything else raises UnicodeError.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Called only for its error: raises if input is not ASCII.
            unicode(input, "ascii")
            labels = input.split(".")

        # Same trailing-separator handling as in encode().
        trailing_dot = u''
        if labels and not labels[-1]:
            trailing_dot = u'.'
            labels.pop()

        decoded = [ToUnicode(label) for label in labels]

        return u".".join(decoded)+trailing_dot, len(input)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000196
class StreamWriter(Codec, codecs.StreamWriter):
    """Combine Codec with codecs.StreamWriter; nothing is added here."""
199
class StreamReader(Codec, codecs.StreamReader):
    """Combine Codec with codecs.StreamReader; nothing is added here."""
202
### encodings module API
204
def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple.

    The encode and decode entries are bound methods, each taken from
    its own freshly created Codec instance.
    """
    encoder = Codec().encode
    decoder = Codec().decode
    return (encoder, decoder, StreamReader, StreamWriter)