blob: 8129af2543a9ae46b2b039ca843408834fce81d0 [file] [log] [blame]
Guido van Rossumd77d6992007-07-16 23:10:57 +00001# -*- coding: utf-8 -*-
Martin v. Löwis2548c732003-04-18 10:39:54 +00002""" Codec for the Punicode encoding, as specified in RFC 3492
3
Guido van Rossumd77d6992007-07-16 23:10:57 +00004Written by Martin v. Löwis.
Martin v. Löwis2548c732003-04-18 10:39:54 +00005"""
6
7import codecs
8
9##################### Encoding #####################################
10
11def segregate(str):
Tim Peters0eadaac2003-04-24 16:02:54 +000012 """3.1 Basic code point segregation"""
Guido van Rossum254348e2007-11-21 19:29:53 +000013 base = bytearray()
Walter Dörwalda4c61282007-05-10 12:36:25 +000014 extended = set()
Martin v. Löwis2548c732003-04-18 10:39:54 +000015 for c in str:
16 if ord(c) < 128:
Walter Dörwalda4c61282007-05-10 12:36:25 +000017 base.append(ord(c))
Martin v. Löwis2548c732003-04-18 10:39:54 +000018 else:
Walter Dörwalda4c61282007-05-10 12:36:25 +000019 extended.add(c)
20 extended = sorted(extended)
Guido van Rossum98297ee2007-11-06 21:34:58 +000021 return bytes(base), extended
Martin v. Löwis2548c732003-04-18 10:39:54 +000022
23def selective_len(str, max):
24 """Return the length of str, considering only characters below max."""
25 res = 0
26 for c in str:
27 if ord(c) < max:
28 res += 1
29 return res
30
31def selective_find(str, char, index, pos):
32 """Return a pair (index, pos), indicating the next occurrence of
33 char in str. index is the position of the character considering
34 only ordinals up to and including char, and pos is the position in
35 the full string. index/pos is the starting position in the full
36 string."""
37
38 l = len(str)
39 while 1:
40 pos += 1
41 if pos == l:
42 return (-1, -1)
43 c = str[pos]
44 if c == char:
45 return index+1, pos
46 elif c < char:
47 index += 1
48
49def insertion_unsort(str, extended):
50 """3.2 Insertion unsort coding"""
51 oldchar = 0x80
52 result = []
53 oldindex = -1
54 for c in extended:
55 index = pos = -1
56 char = ord(c)
57 curlen = selective_len(str, char)
58 delta = (curlen+1) * (char - oldchar)
59 while 1:
60 index,pos = selective_find(str,c,index,pos)
61 if index == -1:
62 break
63 delta += index - oldindex
64 result.append(delta-1)
65 oldindex = index
66 delta = 0
67 oldchar = char
Tim Peters0eadaac2003-04-24 16:02:54 +000068
Martin v. Löwis2548c732003-04-18 10:39:54 +000069 return result
70
71def T(j, bias):
72 # Punycode parameters: tmin = 1, tmax = 26, base = 36
73 res = 36 * (j + 1) - bias
74 if res < 1: return 1
75 if res > 26: return 26
76 return res
77
Walter Dörwalda4c61282007-05-10 12:36:25 +000078digits = b"abcdefghijklmnopqrstuvwxyz0123456789"
Martin v. Löwis2548c732003-04-18 10:39:54 +000079def generate_generalized_integer(N, bias):
80 """3.3 Generalized variable-length integers"""
Guido van Rossum254348e2007-11-21 19:29:53 +000081 result = bytearray()
Martin v. Löwis2548c732003-04-18 10:39:54 +000082 j = 0
83 while 1:
84 t = T(j, bias)
85 if N < t:
86 result.append(digits[N])
Guido van Rossum98297ee2007-11-06 21:34:58 +000087 return bytes(result)
Martin v. Löwis2548c732003-04-18 10:39:54 +000088 result.append(digits[t + ((N - t) % (36 - t))])
89 N = (N - t) // (36 - t)
90 j += 1
91
92def adapt(delta, first, numchars):
93 if first:
94 delta //= 700
95 else:
96 delta //= 2
97 delta += delta // numchars
98 # ((base - tmin) * tmax) // 2 == 455
99 divisions = 0
100 while delta > 455:
101 delta = delta // 35 # base - tmin
102 divisions += 36
103 bias = divisions + (36 * delta // (delta + 38))
104 return bias
Tim Peters0eadaac2003-04-24 16:02:54 +0000105
Martin v. Löwis2548c732003-04-18 10:39:54 +0000106
107def generate_integers(baselen, deltas):
108 """3.4 Bias adaptation"""
109 # Punycode parameters: initial bias = 72, damp = 700, skew = 38
Guido van Rossum254348e2007-11-21 19:29:53 +0000110 result = bytearray()
Martin v. Löwis2548c732003-04-18 10:39:54 +0000111 bias = 72
112 for points, delta in enumerate(deltas):
113 s = generate_generalized_integer(delta, bias)
114 result.extend(s)
115 bias = adapt(delta, points==0, baselen+points+1)
Guido van Rossum98297ee2007-11-06 21:34:58 +0000116 return bytes(result)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000117
118def punycode_encode(text):
119 base, extended = segregate(text)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000120 deltas = insertion_unsort(text, extended)
121 extended = generate_integers(len(base), deltas)
122 if base:
Walter Dörwalda4c61282007-05-10 12:36:25 +0000123 return base + b"-" + extended
Martin v. Löwis2548c732003-04-18 10:39:54 +0000124 return extended
125
126##################### Decoding #####################################
127
128def decode_generalized_number(extended, extpos, bias, errors):
129 """3.3 Generalized variable-length integers"""
130 result = 0
131 w = 1
132 j = 0
133 while 1:
134 try:
135 char = ord(extended[extpos])
136 except IndexError:
137 if errors == "strict":
Collin Winterce36ad82007-08-30 01:19:48 +0000138 raise UnicodeError("incomplete punicode string")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000139 return extpos + 1, None
140 extpos += 1
141 if 0x41 <= char <= 0x5A: # A-Z
142 digit = char - 0x41
143 elif 0x30 <= char <= 0x39:
144 digit = char - 22 # 0x30-26
145 elif errors == "strict":
146 raise UnicodeError("Invalid extended code point '%s'"
147 % extended[extpos])
148 else:
149 return extpos, None
150 t = T(j, bias)
151 result += digit * w
152 if digit < t:
153 return extpos, result
154 w = w * (36 - t)
155 j += 1
Tim Peters0eadaac2003-04-24 16:02:54 +0000156
Martin v. Löwis2548c732003-04-18 10:39:54 +0000157
158def insertion_sort(base, extended, errors):
159 """3.2 Insertion unsort coding"""
160 char = 0x80
161 pos = -1
162 bias = 72
163 extpos = 0
164 while extpos < len(extended):
165 newpos, delta = decode_generalized_number(extended, extpos,
166 bias, errors)
167 if delta is None:
168 # There was an error in decoding. We can't continue because
169 # synchronization is lost.
170 return base
171 pos += delta+1
172 char += pos // (len(base) + 1)
173 if char > 0x10FFFF:
174 if errors == "strict":
Collin Winterce36ad82007-08-30 01:19:48 +0000175 raise UnicodeError("Invalid character U+%x" % char)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000176 char = ord('?')
177 pos = pos % (len(base) + 1)
Guido van Rossum84fc66d2007-05-03 17:18:26 +0000178 base = base[:pos] + chr(char) + base[pos:]
Martin v. Löwis2548c732003-04-18 10:39:54 +0000179 bias = adapt(delta, (extpos == 0), len(base))
180 extpos = newpos
181 return base
182
183def punycode_decode(text, errors):
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000184 if isinstance(text, str):
185 text = text.encode("ascii")
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000186 if isinstance(text, memoryview):
187 text = bytes(text)
Walter Dörwalda4c61282007-05-10 12:36:25 +0000188 pos = text.rfind(b"-")
Martin v. Löwis2548c732003-04-18 10:39:54 +0000189 if pos == -1:
190 base = ""
Walter Dörwalda4c61282007-05-10 12:36:25 +0000191 extended = str(text, "ascii").upper()
Martin v. Löwis2548c732003-04-18 10:39:54 +0000192 else:
Walter Dörwalda4c61282007-05-10 12:36:25 +0000193 base = str(text[:pos], "ascii", errors)
194 extended = str(text[pos+1:], "ascii").upper()
Martin v. Löwis2548c732003-04-18 10:39:54 +0000195 return insertion_sort(base, extended, errors)
Tim Peters0eadaac2003-04-24 16:02:54 +0000196
Martin v. Löwis2548c732003-04-18 10:39:54 +0000197### Codec APIs
198
199class Codec(codecs.Codec):
Martin v. Löwis2548c732003-04-18 10:39:54 +0000200
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000201 def encode(self, input, errors='strict'):
Martin v. Löwis2548c732003-04-18 10:39:54 +0000202 res = punycode_encode(input)
203 return res, len(input)
204
Walter Dörwald0ac30f82007-05-11 10:32:57 +0000205 def decode(self, input, errors='strict'):
Martin v. Löwis2548c732003-04-18 10:39:54 +0000206 if errors not in ('strict', 'replace', 'ignore'):
Collin Winterce36ad82007-08-30 01:19:48 +0000207 raise UnicodeError("Unsupported error handling "+errors)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000208 res = punycode_decode(input, errors)
209 return res, len(input)
210
Thomas Woutersa9773292006-04-21 09:43:23 +0000211class IncrementalEncoder(codecs.IncrementalEncoder):
212 def encode(self, input, final=False):
213 return punycode_encode(input)
214
215class IncrementalDecoder(codecs.IncrementalDecoder):
216 def decode(self, input, final=False):
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000217 if self.errors not in ('strict', 'replace', 'ignore'):
Collin Winterce36ad82007-08-30 01:19:48 +0000218 raise UnicodeError("Unsupported error handling "+self.errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000219 return punycode_decode(input, self.errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000220
Martin v. Löwis2548c732003-04-18 10:39:54 +0000221class StreamWriter(Codec,codecs.StreamWriter):
222 pass
223
224class StreamReader(Codec,codecs.StreamReader):
225 pass
226
227### encodings module API
228
229def getregentry():
Thomas Woutersa9773292006-04-21 09:43:23 +0000230 return codecs.CodecInfo(
231 name='punycode',
232 encode=Codec().encode,
233 decode=Codec().decode,
234 incrementalencoder=IncrementalEncoder,
235 incrementaldecoder=IncrementalDecoder,
236 streamwriter=StreamWriter,
237 streamreader=StreamReader,
238 )