blob: ea4058512fe366d40ec150a943d8d73b4861be22 [file] [log] [blame]
Martin v. Löwis2548c732003-04-18 10:39:54 +00001# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
2
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003import stringprep, re, codecs
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00004from unicodedata import ucd_3_2_0 as unicodedata
Martin v. Löwis2548c732003-04-18 10:39:54 +00005
# IDNA section 3.1
# Pattern matching any one of the code points that this module treats as
# a label separator: U+002E, U+3002, U+FF0E and U+FF61.
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5
# The ACE prefix as bytes (for encoded labels) and as str (for labels
# that have not been encoded yet).
ace_prefix = b"xn--"
sace_prefix = "xn--"
Martin v. Löwis2548c732003-04-18 10:39:54 +000012
13# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Return *label* after the map, normalize, prohibit and bidi steps.

    No check for unassigned code points is made (query-string behaviour,
    i.e. AllowUnassigned is true).

    Raises UnicodeError if the normalized label contains a prohibited
    character or violates one of the bidirectional-text requirements.
    """
    # Map: characters in table B.1 are mapped to nothing, everything
    # else is passed through the table B.2 mapping.
    label = "".join(
        stringprep.map_table_b2(c)
        for c in label
        if not stringprep.in_table_b1(c)
    )

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c)
                or stringprep.in_table_c22(c)
                or stringprep.in_table_c3(c)
                or stringprep.in_table_c4(c)
                or stringprep.in_table_c5(c)
                or stringprep.in_table_c6(c)
                or stringprep.in_table_c7(c)
                or stringprep.in_table_c8(c)
                or stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    RandAL = [stringprep.in_table_d1(x) for x in label]
    # The requirements below concern the label as a whole, so they are
    # evaluated once rather than once per RandALCat character (which
    # would rescan the label each time and be quadratic).
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(x) for x in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
61
def ToASCII(label):
    """Convert one str label to its ASCII (bytes) form.

    A label that is already ASCII is returned encoded as-is.  Otherwise
    it is run through nameprep and, if still not ASCII, punycode-encoded
    and given the ACE prefix.

    Raises UnicodeError if the resulting label is empty or longer than
    63 bytes, or if a non-ASCII label already starts with the ACE prefix.
    """
    # Step 1: try ASCII
    try:
        encoded = label.encode("ascii")
    except UnicodeError:
        encoded = None

    if encoded is None:
        # Step 2: nameprep
        label = nameprep(label)

        # Step 3: UseSTD3ASCIIRules is false
        # Step 4: try ASCII
        try:
            encoded = label.encode("ascii")
        except UnicodeError:
            encoded = None

    if encoded is None:
        # Step 5: Check ACE prefix
        if label.startswith(sace_prefix):
            raise UnicodeError("Label starts with ACE prefix")

        # Step 6: Encode with PUNYCODE
        # Step 7: Prepend ACE prefix
        encoded = ace_prefix + label.encode("punycode")

    # Step 8: Check size (every path above ends here)
    if 0 < len(encoded) < 64:
        return encoded
    raise UnicodeError("label empty or too long")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000104
def ToUnicode(label):
    """Convert one label (bytes or str) to its Unicode (str) form.

    A label without the ACE prefix is returned as its ASCII text.  A
    label with the prefix (in any capitalization) is punycode-decoded,
    and the decoded text is returned only if re-encoding it with
    ToASCII reproduces the lower-cased input.

    Raises UnicodeError if the label is not ASCII after nameprep or if
    it does not round-trip.
    """
    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")
    # Step 3: Check for ACE prefix
    # The prefix is matched case-insensitively, so b"XN--..." is decoded
    # the same way as b"xn--...".
    if label[:len(ace_prefix)].lower() != ace_prefix:
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if str(label, "ascii").lower() != str(label2, "ascii"):
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
Tim Peters0eadaac2003-04-24 16:02:54 +0000143
Martin v. Löwis2548c732003-04-18 10:39:54 +0000144### Codec APIs
145
class Codec(codecs.Codec):
    """Stateless IDNA codec for complete, dot-separated names."""

    def encode(self, input, errors='strict'):
        """Encode the str *input* label by label.

        Returns ``(encoded_bytes, len(input))``.  Raises UnicodeError if
        *errors* is not 'strict' or if a label is empty or longer than
        63 bytes; one empty trailing label (a trailing dot) is allowed.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b'', 0

        try:
            result = input.encode('ascii')
        except UnicodeEncodeError:
            pass
        else:
            # ASCII name: fast path
            labels = result.split(b'.')
            for label in labels[:-1]:
                if not (0 < len(label) < 64):
                    raise UnicodeError("label empty or too long")
            # The last label may be empty: that is a trailing dot.
            if len(labels[-1]) >= 64:
                raise UnicodeError("label too long")
            return result, len(input)

        result = bytearray()
        labels = dots.split(input)
        if labels and not labels[-1]:
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        for label in labels:
            if result:
                # Join with U+002E
                result.extend(b'.')
            result.extend(ToASCII(label))
        return bytes(result+trailing_dot), len(input)

    def decode(self, input, errors='strict'):
        """Decode the bytes-like *input* label by label.

        Returns ``(decoded_str, len(input))``.  Raises UnicodeError if
        *errors* is not 'strict' or if a label cannot be decoded.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if not isinstance(input, bytes):
            # XXX obviously wrong, see #3232
            input = bytes(input)

        # The ACE prefix is case-insensitive, so look for it in a
        # lower-cased copy; otherwise b"XN--..." would take the fast
        # path and never be handed to ToUnicode.
        if ace_prefix not in input.lower():
            # Fast path
            try:
                return input.decode('ascii'), len(input)
            except UnicodeDecodeError:
                pass

        labels = input.split(b".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        result = []
        for label in labels:
            result.append(ToUnicode(label))

        return ".".join(result)+trailing_dot, len(input)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000217
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that emits only complete labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels found in the str *input*.

        Returns ``(encoded_bytes, consumed)`` where *consumed* is the
        number of characters of *input* that were encoded.  Unless
        *final* is true, a last label not followed by a separator is
        left unconsumed.  Raises UnicodeError if *errors* is not
        'strict'.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b'', 0)

        pieces = dots.split(input)
        suffix = b''
        if pieces and (not pieces[-1] or not final):
            ended_on_dot = not pieces[-1]
            # Either the empty piece after a trailing separator, or a
            # potentially unfinished label kept until the next call.
            del pieces[-1]
            if ended_on_dot or pieces:
                suffix = b'.'

        out = bytearray()
        consumed = 0
        for piece in pieces:
            if consumed:
                # Join with U+002E
                out.extend(b'.')
                consumed += 1
            out.extend(ToASCII(piece))
            consumed += len(piece)

        out += suffix
        consumed += len(suffix)
        return (bytes(out), consumed)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000252
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that emits only complete labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels found in *input*.

        *input* may be str or an ASCII bytes-like object.  Returns
        ``(decoded_str, consumed)`` where *consumed* is the number of
        input units that were decoded.  Unless *final* is true, a last
        label not followed by a separator is left unconsumed.  Raises
        UnicodeError if *errors* is not 'strict'.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for index, label in enumerate(labels):
            result.append(ToUnicode(label))
            # Count the separator by position, not by "size so far":
            # a leading empty label leaves size at 0, and its separator
            # would otherwise go uncounted.
            if index:
                size += 1
            size += len(label)

        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
Thomas Woutersa9773292006-04-21 09:43:23 +0000291
# Stream writer for the codec: combines Codec (listed first) with
# codecs.StreamWriter and adds nothing of its own.
class StreamWriter(Codec,codecs.StreamWriter):
    pass
294
# Stream reader for the codec: combines Codec (listed first) with
# codecs.StreamReader and adds nothing of its own.
class StreamReader(Codec,codecs.StreamReader):
    pass
297
298### encodings module API
299
def getregentry():
    """Return the codecs.CodecInfo describing the 'idna' codec."""
    # Each stateless entry point is bound to its own Codec instance.
    encoder = Codec().encode
    decoder = Codec().decode
    info = codecs.CodecInfo(
        name='idna',
        encode=encoder,
        decode=decoder,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
    return info