# This module implements RFC 3490 (IDNA) and RFC 3491 (Nameprep).
| 2 | |
| 3 | import stringprep, unicodedata, re, codecs |
| 4 | |
# IDNA section 3.1: any of these four code points separates labels
# (U+002E full stop, U+3002, U+FF0E, U+FF61).
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix, as a byte string and as a unicode string.
# The two literals must be kept in sync.
ace_prefix = "xn--"
uace_prefix = u"xn--"
| 11 | |
# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Run the Nameprep steps (map, normalize, prohibit, check bidi) on *label*.

    label -- a unicode string holding a single label.

    Returns the mapped and NFKC-normalized label.

    Raises UnicodeError if the prepared label contains a character from one
    of the prohibited stringprep tables, or if it violates the bidi
    requirements.
    """
    # Map: characters in table B.1 are mapped to nothing; every other
    # character goes through the table B.2 mapping.
    label = u"".join([stringprep.map_table_b2(c)
                      for c in label
                      if not stringprep.in_table_b1(c)])

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %s" % repr(c))

    # Check bidi
    # One flag per character: is it a RandALCat character (table D.1)?
    RandAL = [stringprep.in_table_d1(c) for c in label]
    if True in RandAL:
        # There is a RandAL char in the string. Must perform further
        # tests.  They concern the label as a whole, so they are done
        # once rather than once per RandAL character.
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        for c in label:
            if stringprep.in_table_d2(c):
                raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
| 60 | |
def _check_label_size(label):
    """Return *label* if its length is between 1 and 63, else raise UnicodeError."""
    if 0 < len(label) < 64:
        return label
    # Both the empty label and the over-long label end up here, so the
    # message has to cover both cases.
    raise UnicodeError("label empty or too long")


def ToASCII(label):
    """Convert a single label to its ASCII form (IDNA ToASCII operation).

    label -- the label to convert.

    A label that encodes to ASCII, either directly or after nameprep, is
    returned in that encoded form.  Any other label is punycode-encoded
    and prefixed with the ACE prefix.

    Raises UnicodeError if the result is empty or longer than 63
    characters, if nameprep rejects the label, or if a non-ASCII label
    already starts with the ACE prefix.
    """
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        return _check_label_size(label)

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        return _check_label_size(label)

    # Step 5: Check ACE prefix
    if label.startswith(uace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    label = label.encode("punycode")

    # Step 7: Prepend ACE prefix
    label = ace_prefix + label

    # Step 8: Check size
    return _check_label_size(label)
| 103 | |
def ToUnicode(label):
    """Convert a single label to its unicode form (IDNA ToUnicode operation).

    label -- a byte string or a unicode string holding one label.

    A label without the ACE prefix is returned as a unicode string.  A
    label with the prefix is punycode-decoded, and the decoded text is
    returned once ToASCII has confirmed that it maps back to the input.

    Raises UnicodeError if the label is still not ASCII after nameprep, or
    if the decoded label does not round-trip through ToASCII.
    """
    # Step 1: Check for ASCII.  A byte string is taken to be ASCII as is.
    if not isinstance(label, str):
        try:
            label = label.encode("ascii")
        except UnicodeError:
            # Step 2: Perform nameprep
            label = nameprep(label)
            # It doesn't say this, but apparently, it should be ASCII now
            try:
                label = label.encode("ascii")
            except UnicodeError:
                raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    if label.startswith(ace_prefix):
        # Step 4: Remove ACE prefix
        encoded = label[len(ace_prefix):]

        # Step 5: Decode using PUNYCODE
        decoded = encoded.decode("punycode")

        # Step 6: Apply ToASCII
        roundtrip = ToASCII(decoded)

        # Step 7: Compare the result of step 6 with the one of step 3
        # roundtrip will already be in lower case.
        if label.lower() != roundtrip:
            raise UnicodeError("IDNA does not round-trip", label, roundtrip)

        # Step 8: return the result of step 5
        return decoded

    return unicode(label, "ascii")
Tim Peters | 0eadaac | 2003-04-24 16:02:54 +0000 | [diff] [blame] | 142 | |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 143 | ### Codec APIs |
| 144 | |
class Codec(codecs.Codec):
    """Stateless IDNA codec: converts whole domain names label by label."""

    def encode(self, input, errors='strict'):
        """Encode a domain name; return (encoded name, length of input).

        Only errors='strict' is accepted; anything else raises UnicodeError.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling " + errors)

        if not input:
            return "", 0

        labels = dots.split(input)
        # An empty last label means the name ended in a separator; set it
        # aside and re-attach a single dot after conversion.
        trailing_dot = ''
        if labels and not labels[-1]:
            trailing_dot = '.'
            labels.pop()

        encoded = [ToASCII(label) for label in labels]
        # Join with U+002E
        return ".".join(encoded) + trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode a domain name; return (unicode name, length of input).

        Only errors='strict' is accepted; anything else raises UnicodeError.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling " + errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # The result is discarded: this call only verifies that the
            # input decodes as ASCII.
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels and not labels[-1]:
            trailing_dot = u'.'
            labels.pop()

        decoded = [ToUnicode(label) for label in labels]
        return u".".join(decoded) + trailing_dot, len(input)
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 195 | |
class StreamWriter(Codec, codecs.StreamWriter):
    """Combines Codec with codecs.StreamWriter; adds nothing of its own."""
| 198 | |
class StreamReader(Codec, codecs.StreamReader):
    """Combines Codec with codecs.StreamReader; adds nothing of its own."""
| 201 | |
| 202 | ### encodings module API |
| 203 | |
def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) tuple for this codec."""
    # The encoder and the decoder are bound methods of two separate
    # Codec instances.
    encoder = Codec().encode
    decoder = Codec().decode
    return (encoder, decoder, StreamReader, StreamWriter)