Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 1 | # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep) |
| 2 | |
Martin v. Löwis | 480f1bb | 2006-03-09 23:38:20 +0000 | [diff] [blame] | 3 | import stringprep, re, codecs |
Martin v. Löwis | 5bd7c02 | 2006-03-10 11:20:04 +0000 | [diff] [blame] | 4 | from unicodedata import ucd_3_2_0 as unicodedata |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 5 | |
# IDNA section 3.1: characters that must be recognized as label separators.
# Besides U+002E (full stop) these are U+3002 (ideographic full stop),
# U+FF0E (fullwidth full stop) and U+FF61 (halfwidth ideographic full stop).
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix that marks a Punycode-encoded label,
# kept both as a byte string and as a Unicode string.
ace_prefix = "xn--"
uace_prefix = unicode(ace_prefix, "ascii")
| 12 | |
| 13 | # This assumes query strings, so AllowUnassigned is true |
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) of stringprep to a label.

    The steps run in this order: map (tables B.1 and B.2), normalize to
    NFKC, reject prohibited characters, then check the bidirectional
    requirements.  Unassigned code points are not rejected, i.e. the
    label is treated as a query string (AllowUnassigned is true).

    label -- Unicode string holding a single domain name label.

    Returns the prepared Unicode string.  Raises UnicodeError if the
    label contains a prohibited character or violates one of the bidi
    requirements.
    """
    # Map: characters of table B.1 are mapped to nothing, everything
    # else goes through the table B.2 mapping.
    label = u"".join([stringprep.map_table_b2(c)
                      for c in label
                      if not stringprep.in_table_b1(c)])

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %s" % repr(c))

    # Check bidi.  The flags are kept in a real list because the first
    # and the last one are inspected below.
    RandAL = [stringprep.in_table_d1(c) for c in label]
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests (once per label, not once per RandAL character):
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
| 61 | |
def ToASCII(label):
    """Convert a single label to its ASCII form (RFC 3490, section 4.1).

    label -- the label to convert, without any dots.

    A label that is already ASCII is returned ASCII-encoded without
    further processing (UseSTD3ASCIIRules is false).  Any other label
    is run through nameprep and, if it is still not ASCII, encoded
    with Punycode and given the ACE prefix.

    Returns the ASCII-encoded label.  Raises UnicodeError if nameprep
    rejects the label, if a non-ASCII label already starts with the ACE
    prefix, or if the result is empty or longer than 63 characters.
    """
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        # Step 2: nameprep
        label = nameprep(label)

        # Step 3: UseSTD3ASCIIRules is false
        # Step 4: try ASCII
        try:
            label = label.encode("ascii")
        except UnicodeError:
            # Step 5: Check ACE prefix
            if label.startswith(uace_prefix):
                raise UnicodeError("Label starts with ACE prefix")

            # Step 6: Encode with PUNYCODE
            # Step 7: Prepend ACE prefix
            label = ace_prefix + label.encode("punycode")

    # Step 8: Check size.  Every path ends here; both an empty and an
    # over-long label are rejected, and the message says so.
    if 0 < len(label) < 64:
        return label
    raise UnicodeError("label empty or too long")
| 104 | |
def ToUnicode(label):
    """Convert a single label to its Unicode form (RFC 3490, section 4.2).

    label -- the label to convert, as a byte string or a Unicode string.

    A label without the ACE prefix is returned as a Unicode string.  A
    label with the prefix is Punycode-decoded, and the decoded value is
    accepted only if ToASCII turns it back into the (lower-cased) input.

    Returns a Unicode string.  Raises UnicodeError if the label is not
    ASCII after nameprep, if decoding fails, or if the decoded label does
    not round-trip.
    """
    # Step 1: Check for ASCII
    if isinstance(label, str):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix.  RFC 3490 section 5 requires the
    # prefix to be recognized case-insensitively, so "XN--" counts too.
    if not label.lower().startswith(ace_prefix):
        return unicode(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if label.lower() != label2:
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
Tim Peters | 0eadaac | 2003-04-24 16:02:54 +0000 | [diff] [blame] | 143 | |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 144 | ### Codec APIs |
| 145 | |
class Codec(codecs.Codec):
    """IDNA codec operating on complete, dot-separated domain names."""

    def encode(self, input, errors='strict'):
        """Encode a domain name label by label with ToASCII.

        Returns a tuple (encoded name, number of input characters
        consumed).  Only errors='strict' is accepted; anything else
        raises UnicodeError.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling " + errors)

        if not input:
            return "", 0

        labels = dots.split(input)
        # An empty last label means the name ended with a separator;
        # remember it so it can be put back as a plain U+002E.
        if labels and not labels[-1]:
            labels.pop()
            suffix = '.'
        else:
            suffix = ''

        encoded = [ToASCII(label) for label in labels]
        # Join with U+002E
        return ".".join(encoded) + suffix, len(input)

    def decode(self, input, errors='strict'):
        """Decode a domain name label by label with ToUnicode.

        Returns a tuple (Unicode name, length of the input).  Only
        errors='strict' is accepted; anything else raises UnicodeError.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling " + errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # The result is discarded: this call only verifies that the
            # input is pure ASCII (it raises otherwise).
            unicode(input, "ascii")
            labels = input.split(".")

        if labels and not labels[-1]:
            labels.pop()
            suffix = u'.'
        else:
            suffix = u''

        decoded = [ToUnicode(label) for label in labels]
        return u".".join(decoded) + suffix, len(input)
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 196 | |
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder.

    A label can only be converted once it is complete, so the text
    after the last separator is kept in the buffer of the base class
    until a later chunk supplies another separator or final is true.
    """

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels found in input.

        input  -- buffered text followed by the new chunk.
        errors -- error handling scheme; only 'strict' is supported.
        final  -- true if no more input will follow.

        Returns a tuple (encoded text, number of input characters
        consumed); the unconsumed rest is buffered by the base class.
        Raises UnicodeError for unsupported error handling or for labels
        rejected by ToASCII.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling " + errors)

        if not input:
            return ("", 0)

        labels = dots.split(input)
        trailing_dot = ''
        if labels:
            if not labels[-1]:
                # The input ends with a separator
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToASCII(label))
            # Every separator between two labels is one input character
            if size:
                size += 1
            size += len(label)

        # Join with U+002E
        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
| 200 | |
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder.

    A label can only be converted once it is complete, so the data
    after the last separator is kept in the buffer of the base class
    until a later chunk supplies another separator or final is true.
    """

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels found in input.

        input  -- buffered data followed by the new chunk.
        errors -- error handling scheme; only 'strict' is supported.
        final  -- true if no more input will follow.

        Returns a tuple (decoded text, number of input items consumed);
        the unconsumed rest is buffered by the base class.  Raises
        UnicodeError for unsupported error handling, non-ASCII byte
        input, or labels rejected by ToUnicode.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling " + errors)

        if not input:
            return (u"", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # The result is discarded: this call only verifies that the
            # input is pure ASCII (it raises otherwise).
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels:
            if not labels[-1]:
                # The input ends with a separator
                trailing_dot = u'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = u'.'

        result = []
        size = 0
        for label in labels:
            result.append(ToUnicode(label))
            # Every separator between two labels is one input item
            if size:
                size += 1
            size += len(label)

        result = u".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
| 204 | |
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer whose encode() is the one defined by Codec."""
| 207 | |
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader whose decode() is the one defined by Codec."""
| 210 | |
| 211 | ### encodings module API |
| 212 | |
def getregentry():
    """Return the codecs.CodecInfo object describing the 'idna' codec."""
    # The stateless functions are bound methods of two separate Codec
    # instances.
    encoder = Codec().encode
    decoder = Codec().decode
    info = codecs.CodecInfo(
        name='idna',
        encode=encoder,
        decode=decoder,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
    return info