Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 1 | # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep) |
| 2 | |
Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 3 | import stringprep, re, codecs |
Martin v. Löwis | 5bd7c02 | 2006-03-10 11:20:04 +0000 | [diff] [blame] | 4 | from unicodedata import ucd_3_2_0 as unicodedata |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 5 | |
# IDNA section 3.1: characters that are recognised as label separators
# (U+002E full stop, U+3002 ideographic full stop, U+FF0E fullwidth full
# stop, U+FF61 halfwidth ideographic full stop).
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix that is checked for and prepended by
# ToASCII and looked for by ToUnicode.
ace_prefix = "xn--"
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 11 | |
| 12 | # This assumes query strings, so AllowUnassigned is true |
def nameprep(label):
    """Apply the Nameprep profile of stringprep (RFC 3491) to *label*.

    The label is mapped (characters of table B.1 are dropped, the table B.2
    mapping is applied to the rest), normalized with NFKC using the Unicode
    3.2.0 database, checked against the prohibited-character tables and
    finally checked against the bidirectional requirements.  Unassigned code
    points are not rejected, i.e. the label is treated as a query string.

    Returns the prepared string.  Raises UnicodeError when the label holds a
    prohibited character or violates one of the bidi requirements.
    """
    # Map
    mapped = []
    for c in label:
        if stringprep.in_table_b1(c):
            # Map to nothing
            continue
        mapped.append(stringprep.map_table_b2(c))
    label = "".join(mapped)

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    prohibited_tables = (
        stringprep.in_table_c12,
        stringprep.in_table_c22,
        stringprep.in_table_c3,
        stringprep.in_table_c4,
        stringprep.in_table_c5,
        stringprep.in_table_c6,
        stringprep.in_table_c7,
        stringprep.in_table_c8,
        stringprep.in_table_c9,
    )
    for c in label:
        if any(in_table(c) for in_table in prohibited_tables):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi.  A real list is required here: map() and filter() return
    # lazy iterators, which are always true and cannot be indexed.
    RandAL = [stringprep.in_table_d1(c) for c in label]
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        # This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        # MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        # RandALCat character MUST be the first character of the
        # string, and a RandALCat character MUST be the last
        # character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
| 60 | |
def ToASCII(label):
    """Convert a single label to its ASCII form (RFC 3490 ToASCII).

    label -- one label of a domain name, as a str.

    Returns the label as bytes: either the label itself when it is plain
    ASCII (before or after nameprep), or the ACE prefix followed by the
    Punycode encoding of the nameprepped label.

    Raises UnicodeError when the result is empty or longer than 63 bytes,
    when nameprep rejects the label, or when a non-ASCII label already
    starts with the ACE prefix.
    """
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 5: Check ACE prefix (label is still a str at this point)
    if label.startswith(ace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    label = label.encode("punycode")

    # Step 7: Prepend ACE prefix.  The Punycode output is bytes, so the
    # str prefix has to be encoded before it can be concatenated.
    label = ace_prefix.encode("ascii") + label

    # Step 8: Check size
    if 0 < len(label) < 64:
        return label
    raise UnicodeError("label empty or too long")
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 103 | |
def ToUnicode(label):
    """Convert a single label to its Unicode form (RFC 3490 ToUnicode).

    label -- one label of a domain name, as bytes or str.

    Returns a str: the label itself when it does not carry the ACE prefix,
    otherwise the Punycode-decoded label.

    Raises UnicodeError when the label is not ASCII after nameprep, when
    bytes input is not ASCII, or when re-encoding the decoded label with
    ToASCII does not give back the (lower-cased) input.
    """
    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix.  label is bytes from here on, so the
    # str prefix must be encoded before comparing; the prefix is matched
    # regardless of capitalization.
    prefix = ace_prefix.encode("ascii")
    if not label.lower().startswith(prefix):
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if label.lower() != label2:
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
Tim Peters | 0eadaac | 2003-04-24 16:02:54 +0000 | [diff] [blame] | 142 | |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 143 | ### Codec APIs |
| 144 | |
class Codec(codecs.Codec):
    """Stateless IDNA codec operating on complete domain names."""

    def encode(self, input, errors='strict'):
        """Encode the domain name *input* (str) label by label.

        Returns a tuple (bytes, number of characters consumed).  Only the
        'strict' error handler is accepted; anything else raises
        UnicodeError, as do labels rejected by ToASCII.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b"", 0

        labels = dots.split(input)
        # A trailing separator yields an empty last label; keep it as a
        # trailing dot instead of trying to encode an empty label.
        if labels and not labels[-1]:
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''

        result = [ToASCII(label) for label in labels]
        # Join with U+002E
        return b".".join(result) + trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode the domain name *input* label by label.

        *input* may be a str or a bytes-like object.  Returns a tuple
        (str, length of the input consumed).  Only the 'strict' error
        handler is accepted; anything else raises UnicodeError, as do
        labels rejected by ToUnicode.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            # The separator pattern is a str pattern, so it can only be
            # applied to str input.
            labels = dots.split(input)
        else:
            # Force to bytes
            input = bytes(input)
            labels = input.split(b".")

        if labels and not labels[-1]:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        result = [ToUnicode(label) for label in labels]
        return ".".join(result) + trailing_dot, len(input)
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 194 | |
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that emits only complete labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode as many complete labels of *input* (str) as possible.

        Returns a tuple (bytes, number of characters consumed).  Unless
        *final* is true, the last label is left unconsumed because more of
        it may arrive with the next call.  Only the 'strict' error handler
        is accepted; anything else raises UnicodeError.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b"", 0)

        labels = dots.split(input)
        trailing_dot = b''
        if labels:
            if not labels[-1]:
                trailing_dot = b'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = b'.'

        result = []
        size = 0
        for label in labels:
            result.append(ToASCII(label))
            if size:
                # Account for the separator in front of this label
                size += 1
            size += len(label)

        # Join with U+002E; ToASCII returns bytes, so the separator and the
        # trailing dot must be bytes as well.
        result = b".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
| 228 | |
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that emits only complete labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode as many complete labels of *input* as possible.

        *input* may be a str or an ASCII bytes-like object.  Returns a
        tuple (str, amount of input consumed).  Unless *final* is true, the
        last label is left unconsumed because more of it may arrive with
        the next call.  Only the 'strict' error handler is accepted;
        anything else raises UnicodeError, and so does non-ASCII bytes
        input.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string: decode it (str(input) alone would give
            # the repr of a bytes object, not its text).
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToUnicode(label))
            if size:
                # Account for the separator in front of this label
                size += 1
            size += len(label)

        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
Thomas Wouters | a977329 | 2006-04-21 09:43:23 +0000 | [diff] [blame] | 268 | |
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer combining Codec (listed first) with codecs.StreamWriter."""
| 271 | |
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader combining Codec (listed first) with codecs.StreamReader."""
| 274 | |
| 275 | ### encodings module API |
| 276 | |
def getregentry():
    """Return the codecs.CodecInfo entry describing the 'idna' codec."""
    # Each stateless function is bound to its own Codec instance.
    encode_func = Codec().encode
    decode_func = Codec().decode
    entry = codecs.CodecInfo(
        name='idna',
        encode=encode_func,
        decode=decode_func,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
    return entry