blob: 5c3d05663e88c18a6d052c772bb10a7f59a414ff [file] [log] [blame]
Martin v. Löwis2548c732003-04-18 10:39:54 +00001# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
2
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003import stringprep, re, codecs
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00004from unicodedata import ucd_3_2_0 as unicodedata
Martin v. Löwis2548c732003-04-18 10:39:54 +00005
# IDNA section 3.1
# Character class matching a single label separator: U+002E (full stop),
# U+3002, U+FF0E or U+FF61.  Used to split a domain name into labels.
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5
# Prefix that ToASCII puts in front of a Punycode-encoded label and that
# ToUnicode looks for before attempting a Punycode decode.
ace_prefix = "xn--"
Martin v. Löwis2548c732003-04-18 10:39:54 +000011
12# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep (RFC 3491) profile of stringprep to *label*.

    The four steps are run in order: map, normalize (NFKC), prohibit and
    bidi check.  The unassigned-code-point table is not consulted, i.e.
    the label is handled as a query string (AllowUnassigned is true).

    Args:
        label: the label to prepare, as a str.

    Returns:
        The prepared label as a str.

    Raises:
        UnicodeError: if the label contains a prohibited character or
            violates one of the bidirectional requirements.
    """
    # Map: characters in table B.1 are mapped to nothing, everything else
    # goes through the B.2 mapping.
    label = "".join(
        stringprep.map_table_b2(c)
        for c in label
        if not stringprep.in_table_b1(c)
    )

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c)
                or stringprep.in_table_c22(c)
                or stringprep.in_table_c3(c)
                or stringprep.in_table_c4(c)
                or stringprep.in_table_c5(c)
                or stringprep.in_table_c6(c)
                or stringprep.in_table_c7(c)
                or stringprep.in_table_c8(c)
                or stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi.  A real list is required here: the flags are both
    # scanned and indexed, which a lazy map() object does not support.
    rand_al = [stringprep.in_table_d1(c) for c in label]
    if any(rand_al):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        #    (any() is used because a filter() object is always truthy.)
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not rand_al[0] or not rand_al[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
60
def ToASCII(label):
    """Convert a single label to its ASCII form (IDNA ToASCII operation).

    UseSTD3ASCIIRules is treated as false, so no STD3 character checks
    are made.

    Args:
        label: one domain-name label, as a str.

    Returns:
        The label as bytes: unchanged if it is already ASCII (before or
        after nameprep), otherwise the ACE prefix followed by the
        Punycode encoding of the prepared label.

    Raises:
        UnicodeError: if the resulting label is empty or 64 or more bytes
            long, if nameprep rejects the label, or if a non-ASCII label
            already starts with the ACE prefix.
    """
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 5: Check ACE prefix (label is still a str at this point)
    if label.startswith(ace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    label = label.encode("punycode")

    # Step 7: Prepend ACE prefix.  The Punycode output is bytes while the
    # module-level prefix is a str, so the prefix has to be encoded first.
    label = ace_prefix.encode("ascii") + label

    # Step 8: Check size
    if 0 < len(label) < 64:
        return label
    raise UnicodeError("label empty or too long")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000103
def ToUnicode(label):
    """Convert a single label to its Unicode form (IDNA ToUnicode operation).

    Args:
        label: one domain-name label, either bytes (taken to be ASCII) or
            str.

    Returns:
        The label as a str.  A label without the ACE prefix is returned
        as its ASCII text; otherwise the Punycode-decoded label is
        returned.

    Raises:
        UnicodeError: if a non-ASCII label is still non-ASCII after
            nameprep, if Punycode decoding or the ToASCII re-encoding
            fails, or if re-encoding does not reproduce the input label.
    """
    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError as err:
            raise UnicodeError("Invalid character in IDN label") from err

    # From here on label is bytes; the module-level prefix is a str, so an
    # encoded copy is needed for the comparison and for slicing.
    prefix = ace_prefix.encode("ascii")

    # Step 3: Check for ACE prefix
    if not label.startswith(prefix):
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if label.lower() != label2:
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
Tim Peters0eadaac2003-04-24 16:02:54 +0000142
Martin v. Löwis2548c732003-04-18 10:39:54 +0000143### Codec APIs
144
class Codec(codecs.Codec):
    """Stateless IDNA codec operating on whole domain names."""

    def encode(self, input, errors='strict'):
        """Encode a domain name (str) label by label with ToASCII.

        Args:
            input: the domain name; labels may be separated by any of the
                characters matched by ``dots``.
            errors: must be 'strict'.

        Returns:
            A ``(bytes, length_consumed)`` tuple; the labels are joined
            with ``b"."`` and a trailing separator in the input is kept
            as a trailing ``b"."``.

        Raises:
            UnicodeError: for any error handler other than 'strict', or
                if ToASCII rejects a label.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b"", 0

        labels = dots.split(input)
        if labels and not labels[-1]:
            # The name ended in a separator: remember it and drop the
            # empty label that split() produced for it.
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''

        # Join with U+002E
        encoded = b".".join([ToASCII(label) for label in labels])
        return encoded + trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode a domain name label by label with ToUnicode.

        Args:
            input: the domain name, as bytes (or another object accepted
                by ``bytes()``) or as str.
            errors: must be 'strict'.

        Returns:
            A ``(str, length_consumed)`` tuple; the labels are joined
            with "." and a trailing separator in the input is kept.

        Raises:
            UnicodeError: for any error handler other than 'strict', or
                if ToUnicode rejects a label.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            # The separator pattern is a str pattern, so it can only be
            # applied to str input.
            labels = dots.split(input)
        else:
            # Force to bytes
            input = bytes(input)
            labels = input.split(b".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        result = [ToUnicode(label) for label in labels]

        return ".".join(result)+trailing_dot, len(input)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000194
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that emits only completed labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode as many complete labels of *input* as possible.

        Args:
            input: the buffered text (str) to encode.
            errors: must be 'strict'.
            final: true if no more input will follow; when false, the
                last label is held back because it may be incomplete.

        Returns:
            A ``(bytes, consumed)`` tuple, where *consumed* is the number
            of input characters covered by the returned bytes.

        Raises:
            UnicodeError: for any error handler other than 'strict', or
                if ToASCII rejects a label.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            # The encoder produces bytes, so the empty result is bytes too.
            return (b"", 0)

        labels = dots.split(input)
        trailing_dot = b''
        if labels:
            if not labels[-1]:
                trailing_dot = b'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = b'.'

        result = []
        size = 0
        for label in labels:
            result.append(ToASCII(label))
            if size:
                # Account for the separator in front of this label.
                size += 1
            size += len(label)

        # Join with U+002E
        result = b".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
228
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that emits only completed labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode as many complete labels of *input* as possible.

        Args:
            input: the buffered data, as str or as ASCII bytes.
            errors: must be 'strict'.
            final: true if no more input will follow; when false, the
                last label is held back because it may be incomplete.

        Returns:
            A ``(str, consumed)`` tuple, where *consumed* is the number
            of input items covered by the returned text.

        Raises:
            UnicodeError: for any error handler other than 'strict', if
                bytes input is not ASCII, or if ToUnicode rejects a label.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string: decode it (this also validates it) so
            # that the labels can be split and measured as text.
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToUnicode(label))
            if size:
                # Account for the separator in front of this label.
                size += 1
            size += len(label)

        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
Thomas Woutersa9773292006-04-21 09:43:23 +0000268
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer for this codec; defines nothing beyond its bases."""
271
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader for this codec; defines nothing beyond its bases."""
274
275### encodings module API
276
def getregentry():
    """Return the codecs.CodecInfo record that describes this codec."""
    # Separate Codec instances supply the stateless encode and decode
    # entry points.
    entry = {
        "name": 'idna',
        "encode": Codec().encode,
        "decode": Codec().decode,
        "incrementalencoder": IncrementalEncoder,
        "incrementaldecoder": IncrementalDecoder,
        "streamwriter": StreamWriter,
        "streamreader": StreamReader,
    }
    return codecs.CodecInfo(**entry)