Guido van Rossum | d77d699 | 2007-07-16 23:10:57 +0000 | [diff] [blame] | 1 | # -*- coding: utf-8 -*- |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 2 | """ Codec for the Punicode encoding, as specified in RFC 3492 |
| 3 | |
Guido van Rossum | d77d699 | 2007-07-16 23:10:57 +0000 | [diff] [blame] | 4 | Written by Martin v. Löwis. |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 5 | """ |
| 6 | |
| 7 | import codecs |
| 8 | |
| 9 | ##################### Encoding ##################################### |
| 10 | |
| 11 | def segregate(str): |
Tim Peters | 0eadaac | 2003-04-24 16:02:54 +0000 | [diff] [blame] | 12 | """3.1 Basic code point segregation""" |
Guido van Rossum | 254348e | 2007-11-21 19:29:53 +0000 | [diff] [blame] | 13 | base = bytearray() |
Walter Dörwald | a4c6128 | 2007-05-10 12:36:25 +0000 | [diff] [blame] | 14 | extended = set() |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 15 | for c in str: |
| 16 | if ord(c) < 128: |
Walter Dörwald | a4c6128 | 2007-05-10 12:36:25 +0000 | [diff] [blame] | 17 | base.append(ord(c)) |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 18 | else: |
Walter Dörwald | a4c6128 | 2007-05-10 12:36:25 +0000 | [diff] [blame] | 19 | extended.add(c) |
| 20 | extended = sorted(extended) |
Guido van Rossum | 98297ee | 2007-11-06 21:34:58 +0000 | [diff] [blame] | 21 | return bytes(base), extended |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 22 | |
| 23 | def selective_len(str, max): |
| 24 | """Return the length of str, considering only characters below max.""" |
| 25 | res = 0 |
| 26 | for c in str: |
| 27 | if ord(c) < max: |
| 28 | res += 1 |
| 29 | return res |
| 30 | |
| 31 | def selective_find(str, char, index, pos): |
| 32 | """Return a pair (index, pos), indicating the next occurrence of |
| 33 | char in str. index is the position of the character considering |
| 34 | only ordinals up to and including char, and pos is the position in |
| 35 | the full string. index/pos is the starting position in the full |
| 36 | string.""" |
| 37 | |
| 38 | l = len(str) |
| 39 | while 1: |
| 40 | pos += 1 |
| 41 | if pos == l: |
| 42 | return (-1, -1) |
| 43 | c = str[pos] |
| 44 | if c == char: |
| 45 | return index+1, pos |
| 46 | elif c < char: |
| 47 | index += 1 |
| 48 | |
| 49 | def insertion_unsort(str, extended): |
| 50 | """3.2 Insertion unsort coding""" |
| 51 | oldchar = 0x80 |
| 52 | result = [] |
| 53 | oldindex = -1 |
| 54 | for c in extended: |
| 55 | index = pos = -1 |
| 56 | char = ord(c) |
| 57 | curlen = selective_len(str, char) |
| 58 | delta = (curlen+1) * (char - oldchar) |
| 59 | while 1: |
| 60 | index,pos = selective_find(str,c,index,pos) |
| 61 | if index == -1: |
| 62 | break |
| 63 | delta += index - oldindex |
| 64 | result.append(delta-1) |
| 65 | oldindex = index |
| 66 | delta = 0 |
| 67 | oldchar = char |
Tim Peters | 0eadaac | 2003-04-24 16:02:54 +0000 | [diff] [blame] | 68 | |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 69 | return result |
| 70 | |
| 71 | def T(j, bias): |
| 72 | # Punycode parameters: tmin = 1, tmax = 26, base = 36 |
| 73 | res = 36 * (j + 1) - bias |
| 74 | if res < 1: return 1 |
| 75 | if res > 26: return 26 |
| 76 | return res |
| 77 | |
Walter Dörwald | a4c6128 | 2007-05-10 12:36:25 +0000 | [diff] [blame] | 78 | digits = b"abcdefghijklmnopqrstuvwxyz0123456789" |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 79 | def generate_generalized_integer(N, bias): |
| 80 | """3.3 Generalized variable-length integers""" |
Guido van Rossum | 254348e | 2007-11-21 19:29:53 +0000 | [diff] [blame] | 81 | result = bytearray() |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 82 | j = 0 |
| 83 | while 1: |
| 84 | t = T(j, bias) |
| 85 | if N < t: |
| 86 | result.append(digits[N]) |
Guido van Rossum | 98297ee | 2007-11-06 21:34:58 +0000 | [diff] [blame] | 87 | return bytes(result) |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 88 | result.append(digits[t + ((N - t) % (36 - t))]) |
| 89 | N = (N - t) // (36 - t) |
| 90 | j += 1 |
| 91 | |
| 92 | def adapt(delta, first, numchars): |
| 93 | if first: |
| 94 | delta //= 700 |
| 95 | else: |
| 96 | delta //= 2 |
| 97 | delta += delta // numchars |
| 98 | # ((base - tmin) * tmax) // 2 == 455 |
| 99 | divisions = 0 |
| 100 | while delta > 455: |
| 101 | delta = delta // 35 # base - tmin |
| 102 | divisions += 36 |
| 103 | bias = divisions + (36 * delta // (delta + 38)) |
| 104 | return bias |
Tim Peters | 0eadaac | 2003-04-24 16:02:54 +0000 | [diff] [blame] | 105 | |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 106 | |
| 107 | def generate_integers(baselen, deltas): |
| 108 | """3.4 Bias adaptation""" |
| 109 | # Punycode parameters: initial bias = 72, damp = 700, skew = 38 |
Guido van Rossum | 254348e | 2007-11-21 19:29:53 +0000 | [diff] [blame] | 110 | result = bytearray() |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 111 | bias = 72 |
| 112 | for points, delta in enumerate(deltas): |
| 113 | s = generate_generalized_integer(delta, bias) |
| 114 | result.extend(s) |
| 115 | bias = adapt(delta, points==0, baselen+points+1) |
Guido van Rossum | 98297ee | 2007-11-06 21:34:58 +0000 | [diff] [blame] | 116 | return bytes(result) |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 117 | |
| 118 | def punycode_encode(text): |
| 119 | base, extended = segregate(text) |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 120 | deltas = insertion_unsort(text, extended) |
| 121 | extended = generate_integers(len(base), deltas) |
| 122 | if base: |
Walter Dörwald | a4c6128 | 2007-05-10 12:36:25 +0000 | [diff] [blame] | 123 | return base + b"-" + extended |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 124 | return extended |
| 125 | |
| 126 | ##################### Decoding ##################################### |
| 127 | |
| 128 | def decode_generalized_number(extended, extpos, bias, errors): |
| 129 | """3.3 Generalized variable-length integers""" |
| 130 | result = 0 |
| 131 | w = 1 |
| 132 | j = 0 |
| 133 | while 1: |
| 134 | try: |
| 135 | char = ord(extended[extpos]) |
| 136 | except IndexError: |
| 137 | if errors == "strict": |
Collin Winter | ce36ad8 | 2007-08-30 01:19:48 +0000 | [diff] [blame] | 138 | raise UnicodeError("incomplete punicode string") |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 139 | return extpos + 1, None |
| 140 | extpos += 1 |
| 141 | if 0x41 <= char <= 0x5A: # A-Z |
| 142 | digit = char - 0x41 |
| 143 | elif 0x30 <= char <= 0x39: |
| 144 | digit = char - 22 # 0x30-26 |
| 145 | elif errors == "strict": |
| 146 | raise UnicodeError("Invalid extended code point '%s'" |
| 147 | % extended[extpos]) |
| 148 | else: |
| 149 | return extpos, None |
| 150 | t = T(j, bias) |
| 151 | result += digit * w |
| 152 | if digit < t: |
| 153 | return extpos, result |
| 154 | w = w * (36 - t) |
| 155 | j += 1 |
Tim Peters | 0eadaac | 2003-04-24 16:02:54 +0000 | [diff] [blame] | 156 | |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 157 | |
| 158 | def insertion_sort(base, extended, errors): |
| 159 | """3.2 Insertion unsort coding""" |
| 160 | char = 0x80 |
| 161 | pos = -1 |
| 162 | bias = 72 |
| 163 | extpos = 0 |
| 164 | while extpos < len(extended): |
| 165 | newpos, delta = decode_generalized_number(extended, extpos, |
| 166 | bias, errors) |
| 167 | if delta is None: |
| 168 | # There was an error in decoding. We can't continue because |
| 169 | # synchronization is lost. |
| 170 | return base |
| 171 | pos += delta+1 |
| 172 | char += pos // (len(base) + 1) |
| 173 | if char > 0x10FFFF: |
| 174 | if errors == "strict": |
Collin Winter | ce36ad8 | 2007-08-30 01:19:48 +0000 | [diff] [blame] | 175 | raise UnicodeError("Invalid character U+%x" % char) |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 176 | char = ord('?') |
| 177 | pos = pos % (len(base) + 1) |
Guido van Rossum | 84fc66d | 2007-05-03 17:18:26 +0000 | [diff] [blame] | 178 | base = base[:pos] + chr(char) + base[pos:] |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 179 | bias = adapt(delta, (extpos == 0), len(base)) |
| 180 | extpos = newpos |
| 181 | return base |
| 182 | |
| 183 | def punycode_decode(text, errors): |
Walter Dörwald | 0ac30f8 | 2007-05-11 10:32:57 +0000 | [diff] [blame] | 184 | if isinstance(text, str): |
| 185 | text = text.encode("ascii") |
Marc-André Lemburg | b2750b5 | 2008-06-06 12:18:17 +0000 | [diff] [blame] | 186 | if isinstance(text, memoryview): |
| 187 | text = bytes(text) |
Walter Dörwald | a4c6128 | 2007-05-10 12:36:25 +0000 | [diff] [blame] | 188 | pos = text.rfind(b"-") |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 189 | if pos == -1: |
| 190 | base = "" |
Walter Dörwald | a4c6128 | 2007-05-10 12:36:25 +0000 | [diff] [blame] | 191 | extended = str(text, "ascii").upper() |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 192 | else: |
Walter Dörwald | a4c6128 | 2007-05-10 12:36:25 +0000 | [diff] [blame] | 193 | base = str(text[:pos], "ascii", errors) |
| 194 | extended = str(text[pos+1:], "ascii").upper() |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 195 | return insertion_sort(base, extended, errors) |
Tim Peters | 0eadaac | 2003-04-24 16:02:54 +0000 | [diff] [blame] | 196 | |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 197 | ### Codec APIs |
| 198 | |
| 199 | class Codec(codecs.Codec): |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 200 | |
Walter Dörwald | 0ac30f8 | 2007-05-11 10:32:57 +0000 | [diff] [blame] | 201 | def encode(self, input, errors='strict'): |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 202 | res = punycode_encode(input) |
| 203 | return res, len(input) |
| 204 | |
Walter Dörwald | 0ac30f8 | 2007-05-11 10:32:57 +0000 | [diff] [blame] | 205 | def decode(self, input, errors='strict'): |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 206 | if errors not in ('strict', 'replace', 'ignore'): |
Collin Winter | ce36ad8 | 2007-08-30 01:19:48 +0000 | [diff] [blame] | 207 | raise UnicodeError("Unsupported error handling "+errors) |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 208 | res = punycode_decode(input, errors) |
| 209 | return res, len(input) |
| 210 | |
Thomas Wouters | a977329 | 2006-04-21 09:43:23 +0000 | [diff] [blame] | 211 | class IncrementalEncoder(codecs.IncrementalEncoder): |
| 212 | def encode(self, input, final=False): |
| 213 | return punycode_encode(input) |
| 214 | |
| 215 | class IncrementalDecoder(codecs.IncrementalDecoder): |
| 216 | def decode(self, input, final=False): |
Thomas Wouters | 0e3f591 | 2006-08-11 14:57:12 +0000 | [diff] [blame] | 217 | if self.errors not in ('strict', 'replace', 'ignore'): |
Collin Winter | ce36ad8 | 2007-08-30 01:19:48 +0000 | [diff] [blame] | 218 | raise UnicodeError("Unsupported error handling "+self.errors) |
Thomas Wouters | 0e3f591 | 2006-08-11 14:57:12 +0000 | [diff] [blame] | 219 | return punycode_decode(input, self.errors) |
Thomas Wouters | a977329 | 2006-04-21 09:43:23 +0000 | [diff] [blame] | 220 | |
Martin v. Löwis | 2548c73 | 2003-04-18 10:39:54 +0000 | [diff] [blame] | 221 | class StreamWriter(Codec,codecs.StreamWriter): |
| 222 | pass |
| 223 | |
| 224 | class StreamReader(Codec,codecs.StreamReader): |
| 225 | pass |
| 226 | |
| 227 | ### encodings module API |
| 228 | |
| 229 | def getregentry(): |
Thomas Wouters | a977329 | 2006-04-21 09:43:23 +0000 | [diff] [blame] | 230 | return codecs.CodecInfo( |
| 231 | name='punycode', |
| 232 | encode=Codec().encode, |
| 233 | decode=Codec().decode, |
| 234 | incrementalencoder=IncrementalEncoder, |
| 235 | incrementaldecoder=IncrementalDecoder, |
| 236 | streamwriter=StreamWriter, |
| 237 | streamreader=StreamReader, |
| 238 | ) |