blob: b226d22fda56a591cc8dfef44512c7cc3476164e [file] [log] [blame]
Martin v. Löwis2548c732003-04-18 10:39:54 +00001# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
2
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003import stringprep, re, codecs
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00004from unicodedata import ucd_3_2_0 as unicodedata
Martin v. Löwis2548c732003-04-18 10:39:54 +00005
# IDNA section 3.1
# Pattern matching any single label-separator character; the class lists the
# four code points U+002E, U+3002, U+FF0E and U+FF61.
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5
# The ACE prefix, kept as text; it marks labels that carry a
# Punycode-encoded payload.
ace_prefix = "xn--"
Martin v. Löwis2548c732003-04-18 10:39:54 +000011
12# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) to a single label.

    The label is mapped (table B.1 characters are dropped, the rest go
    through table B.2), NFKC-normalized with the Unicode 3.2.0 database,
    checked against the prohibited-character tables and finally checked
    against the bidirectional-text requirements.

    label -- the text label to prepare (a str).

    Returns the prepared label as a str.

    Raises UnicodeError if the label contains a prohibited character or
    violates one of the bidi requirements.
    """
    # Map
    newlabel = []
    for c in label:
        if stringprep.in_table_b1(c):
            # Map to nothing
            continue
        newlabel.append(stringprep.map_table_b2(c))
    label = "".join(newlabel)

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    # Materialize the per-character flags in a list: the result is both
    # scanned and indexed below, which a lazy map() object cannot support.
    RandAL = [stringprep.in_table_d1(c) for c in label]
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
60
def ToASCII(label):
    """Convert one text label to its ASCII form (RFC 3490 ToASCII).

    label -- a single label (str), without any separating dots.

    Returns the label as bytes: either the label itself when it is (or
    nameprep makes it) pure ASCII, or the ACE prefix followed by the
    Punycode encoding of the prepared label.

    Raises UnicodeError if the result is empty or longer than 63 bytes,
    if nameprep rejects the label, or if a non-ASCII label already starts
    with the ACE prefix.
    """
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 5: Check ACE prefix
    # label is still text here, so it is compared with the text prefix.
    if label.startswith(ace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    label = label.encode("punycode")

    # Step 7: Prepend ACE prefix
    # The punycode codec produced bytes, so the text prefix is encoded
    # before concatenation.
    label = ace_prefix.encode("ascii") + label

    # Step 8: Check size
    if 0 < len(label) < 64:
        return label
    raise UnicodeError("label empty or too long")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000103
def ToUnicode(label):
    """Convert one label to its Unicode form (RFC 3490 ToUnicode).

    label -- a single label, either text (str) or an ASCII bytes-like
             object, without any separating dots.

    Returns the label as a str. Labels that do not start with the ACE
    prefix are returned as (prepared) ASCII text; labels that do are
    Punycode-decoded.

    Raises UnicodeError if bytes input is not ASCII, if a text label is
    still non-ASCII after nameprep, if Punycode decoding fails, or if
    re-encoding the decoded label with ToASCII does not reproduce the
    input.
    """
    # Step 1: Check for ASCII
    if isinstance(label, str):
        try:
            label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
        if not pure_ascii:
            # Step 2: Perform nameprep
            label = nameprep(label)
            # It doesn't say this, but apparently, it should be ASCII now
            try:
                label.encode("ascii")
            except UnicodeError:
                raise UnicodeError("Invalid character in IDN label")
    else:
        # Bytes-like input must be ASCII; decoding raises UnicodeError
        # (UnicodeDecodeError) otherwise.
        label = str(label, "ascii")

    # From here on, label is ASCII-only text.

    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return label

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    # The punycode codec decodes bytes, so the ASCII text is encoded first.
    result = label1.encode("ascii").decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)
    if isinstance(label2, bytes):
        # Bring the re-encoded label back to text for the comparison.
        label2 = label2.decode("ascii")

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if label.lower() != label2:
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
Tim Peters0eadaac2003-04-24 16:02:54 +0000142
Martin v. Löwis2548c732003-04-18 10:39:54 +0000143### Codec APIs
144
class Codec(codecs.Codec):
    """Stateless IDNA codec: text domain names <-> ASCII bytes."""

    def encode(self, input, errors='strict'):
        """Encode a dotted domain name label by label with ToASCII.

        input  -- the domain name as text; any of the separator
                  characters matched by ``dots`` splits labels.
        errors -- must be 'strict'.

        Returns ``(encoded_bytes, len(input))``. Labels are joined with
        U+002E and a trailing separator in the input is kept as a
        trailing ``b'.'``.

        Raises UnicodeError for any other error handler, or when a
        label cannot be converted.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b"", 0

        result = []
        labels = dots.split(input)
        if labels and len(labels[-1]) == 0:
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        for label in labels:
            result.append(ToASCII(label))
        # Join with U+002E
        return b".".join(result) + trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode a dotted domain name label by label with ToUnicode.

        input  -- the domain name, either text or an ASCII bytes-like
                  object.
        errors -- must be 'strict'.

        Returns ``(decoded_text, len(input))``; a trailing separator in
        the input is kept as a trailing ``'.'``.

        Raises UnicodeError for any other error handler, for non-ASCII
        bytes input, or when a label cannot be converted.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string; decoding both validates and converts.
            input = str(input, "ascii")
            labels = input.split(".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        result = []
        for label in labels:
            result.append(ToUnicode(label))

        return ".".join(result)+trailing_dot, len(input)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000195
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits complete labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode as many complete labels of *input* as possible.

        input  -- the text available so far.
        errors -- must be 'strict'.
        final  -- true when no more input will follow; otherwise the
                  last, possibly unfinished, label is left unconsumed.

        Returns ``(encoded_bytes, consumed)`` where *consumed* is the
        number of input characters accounted for by the output.

        Raises UnicodeError for any other error handler, or when a
        label cannot be converted.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b"", 0)

        labels = dots.split(input)
        trailing_dot = b''
        if labels:
            if not labels[-1]:
                trailing_dot = b'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = b'.'

        result = []
        size = 0
        for label in labels:
            result.append(ToASCII(label))
            if size:
                # Account for the separator that preceded this label.
                size += 1
            size += len(label)

        # Join with U+002E
        result = b".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
229
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only emits complete labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode as many complete labels of *input* as possible.

        input  -- the data available so far, either text or an ASCII
                  bytes-like object.
        errors -- must be 'strict'.
        final  -- true when no more input will follow; otherwise the
                  last, possibly unfinished, label is left unconsumed.

        Returns ``(decoded_text, consumed)`` where *consumed* is the
        number of input items accounted for by the output.

        Raises UnicodeError for any other error handler, for non-ASCII
        bytes input, or when a label cannot be converted.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string; decoding both validates and converts.
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToUnicode(label))
            if size:
                # Account for the separator that preceded this label.
                size += 1
            size += len(label)

        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
Thomas Woutersa9773292006-04-21 09:43:23 +0000269
# Stream writer for the IDNA codec. It adds nothing of its own: it only
# combines Codec with codecs.StreamWriter as base classes.
class StreamWriter(Codec,codecs.StreamWriter):
    pass
272
# Stream reader for the IDNA codec. It adds nothing of its own: it only
# combines Codec with codecs.StreamReader as base classes.
class StreamReader(Codec,codecs.StreamReader):
    pass
275
276### encodings module API
277
def getregentry():
    """Build the codecs.CodecInfo record that describes the 'idna' codec."""
    # The stateless entry points are bound methods of a Codec instance.
    stateless = Codec()
    info = codecs.CodecInfo(
        name='idna',
        encode=stateless.encode,
        decode=stateless.decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
    return info