blob: 66c51013ea431ace41d4c7b20da8df3349dc834c [file] [log] [blame]
""" Codec for the Punycode encoding, as specified in RFC 3492

Written by Martin v. Löwis.
"""
5
6import codecs
7
8##################### Encoding #####################################
9
def segregate(str):
    """3.1 Basic code point segregation.

    Split *str* into its basic (ordinal < 128) and extended characters.

    Returns a pair ``(base, extended)``: ``base`` is a bytes object holding
    the basic characters in their original order, and ``extended`` is a
    sorted list of the distinct characters whose ordinal is 128 or above.
    """
    chars = list(str)
    basic = bytes(ord(ch) for ch in chars if ord(ch) < 128)
    # A set drops duplicates; sorting yields ascending code point order.
    non_basic = sorted({ch for ch in chars if ord(ch) >= 128})
    return basic, non_basic
Martin v. Löwis2548c732003-04-18 10:39:54 +000021
def selective_len(str, max):
    """Count the characters of str whose ordinal is strictly below max."""
    return sum(1 for ch in str if ord(ch) < max)
29
def selective_find(str, char, index, pos):
    """Locate the next occurrence of char in str after position pos.

    The scan starts at ``pos + 1``.  While scanning, ``index`` is advanced
    once for every character that compares lower than ``char``, so it
    tracks the position among the characters up to and including ``char``
    only, whereas ``pos`` is the position in the full string.

    Returns ``(index + 1, pos)`` for the occurrence found, or ``(-1, -1)``
    when the end of the string is reached without a match.
    """
    end = len(str)
    pos += 1
    while pos != end:
        current = str[pos]
        if current == char:
            return index + 1, pos
        if current < char:
            # Only characters sorting below char count towards index.
            index += 1
        pos += 1
    return (-1, -1)
47
def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding.

    Walk the characters of *extended* in the order given and, for every
    occurrence of each one in *str*, emit one integer delta.

    Returns the list of deltas, one per occurrence found.
    """
    deltas = []
    prev_code = 0x80
    prev_index = -1
    for ch in extended:
        code = ord(ch)
        # Moving from the previous code point to this one costs
        # (curlen + 1) steps per skipped code point, where curlen counts
        # the characters of str that lie below the current code point.
        delta = (selective_len(str, code) + 1) * (code - prev_code)
        index, pos = selective_find(str, ch, -1, -1)
        while index != -1:
            delta += index - prev_index
            deltas.append(delta - 1)
            prev_index = index
            delta = 0
            index, pos = selective_find(str, ch, index, pos)
        prev_code = code

    return deltas
69
def T(j, bias):
    """Return the threshold for digit position j under the given bias.

    The raw value ``base * (j + 1) - bias`` is clamped to the range
    ``tmin..tmax``.
    """
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    return min(max(36 * (j + 1) - bias, 1), 26)
76
# Digit alphabet: values 0-25 map to a-z, values 26-35 map to 0-9.
digits = b"abcdefghijklmnopqrstuvwxyz0123456789"


def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers.

    Encode the integer N as a sequence of base-36 digits whose per-position
    thresholds are given by ``T(j, bias)``.

    Returns the encoded digits as a bytes object.
    """
    encoded = bytearray()
    position = 0
    threshold = T(position, bias)
    while N >= threshold:
        # Emit one non-final digit and carry the quotient to the next one.
        N, remainder = divmod(N - threshold, 36 - threshold)
        encoded.append(digits[threshold + remainder])
        position += 1
        threshold = T(position, bias)
    # A digit below the threshold terminates the number.
    encoded.append(digits[N])
    return bytes(encoded)
90
def adapt(delta, first, numchars):
    """Compute the bias to use for the next delta.

    delta is the delta just processed, first is true when it was the very
    first delta, and numchars is the number of code points handled so far.

    Returns the new bias as an integer.
    """
    # The first delta is scaled down by damp (700), later ones are halved.
    delta //= 700 if first else 2
    delta += delta // numchars
    # ((base - tmin) * tmax) // 2 == 455
    offset = 0
    while delta > 455:
        delta //= 35  # base - tmin
        offset += 36  # base
    # 38 is the skew parameter.
    return offset + (36 * delta // (delta + 38))
Tim Peters0eadaac2003-04-24 16:02:54 +0000104
Martin v. Löwis2548c732003-04-18 10:39:54 +0000105
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation.

    Encode every delta as a generalized variable-length integer and
    re-adapt the bias after each one.  baselen is the number of basic code
    points in the string being encoded.

    Returns the concatenated encodings as a bytes object.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    bias = 72
    pieces = []
    for count, delta in enumerate(deltas, 1):
        pieces.append(generate_generalized_integer(delta, bias))
        # count == 1 marks the first delta; baselen + count is the number
        # of code points handled so far.
        bias = adapt(delta, count == 1, baselen + count)
    return b"".join(pieces)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000116
def punycode_encode(text):
    """Encode the string text as Punycode and return the result as bytes.

    When text contains basic (ASCII) characters, the output is those
    characters, a ``-`` delimiter and the encoded deltas; otherwise only
    the encoded deltas are returned.
    """
    basic, non_basic = segregate(text)
    encoded_deltas = generate_integers(len(basic),
                                       insertion_unsort(text, non_basic))
    if not basic:
        return encoded_deltas
    return b"-".join((basic, encoded_deltas))
124
125##################### Decoding #####################################
126
def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one variable-length integer from the string extended, starting
    at index extpos and using the thresholds ``T(j, bias)``.

    Only the digits ``A``-``Z`` (values 0-25) and ``0``-``9`` (values
    26-35) are accepted; lower-case letters are not recognised here.

    Returns a pair ``(newpos, value)`` where newpos is the index just past
    the last digit consumed.  With an errors mode other than "strict",
    value is None if the input ends prematurely or contains an invalid
    digit.

    Raises UnicodeError in "strict" mode for truncated input or an invalid
    digit.
    """
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punycode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            # extpos already points past the offending character, so step
            # back one to report it (and to avoid an IndexError when the
            # bad character is the last one in the string).
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos-1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            # A digit below the threshold terminates the number.
            return extpos, result
        w = w * (36 - t)
        j += 1
Tim Peters0eadaac2003-04-24 16:02:54 +0000155
Martin v. Löwis2548c732003-04-18 10:39:54 +0000156
def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding.

    Decode the deltas stored in extended one after another and insert the
    character each one designates into the string base.

    Returns the resulting string.  With an errors mode other than "strict",
    decoding stops at the first undecodable delta and the string built so
    far is returned; a code point above U+10FFFF is replaced by ``?``.

    Raises UnicodeError in "strict" mode when a decoded code point exceeds
    U+10FFFF.
    """
    code_point = 0x80
    insert_at = -1
    bias = 72
    cursor = 0
    total = len(extended)
    while cursor < total:
        next_cursor, delta = decode_generalized_number(extended, cursor,
                                                       bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        # Each full pass over the len(base) + 1 insertion slots advances
        # the code point by one; the remainder is the insertion position.
        passes, insert_at = divmod(insert_at + delta + 1, len(base) + 1)
        code_point += passes
        if code_point > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % code_point)
            code_point = ord('?')
        base = base[:insert_at] + chr(code_point) + base[insert_at:]
        bias = adapt(delta, cursor == 0, len(base))
        cursor = next_cursor
    return base
181
def punycode_decode(text, errors):
    """Decode Punycode text and return the resulting string.

    text may be a str, a memoryview or a bytes-like object providing
    ``rfind``.  Everything before the last ``-`` is taken as the basic
    characters (decoded as ASCII with the given errors mode); the rest is
    upper-cased and decoded as the sequence of deltas.
    """
    if isinstance(text, str):
        text = text.encode("ascii")
    elif isinstance(text, memoryview):
        text = bytes(text)
    delimiter = text.rfind(b"-")
    if delimiter == -1:
        # No delimiter: there are no basic characters at all.
        base = ""
        tail = text
    else:
        base = str(text[:delimiter], "ascii", errors)
        tail = text[delimiter + 1:]
    extended = str(tail, "ascii").upper()
    return insertion_sort(base, extended, errors)
Tim Peters0eadaac2003-04-24 16:02:54 +0000195
Martin v. Löwis2548c732003-04-18 10:39:54 +0000196### Codec APIs
197
class Codec(codecs.Codec):
    """Stateless Punycode encoder/decoder."""

    def encode(self, input, errors='strict'):
        """Return ``(encoded bytes, number of characters consumed)``.

        The errors argument is accepted but not used.
        """
        return punycode_encode(input), len(input)

    def decode(self, input, errors='strict'):
        """Return ``(decoded string, length of input consumed)``.

        Raises UnicodeError if errors is not one of 'strict', 'replace'
        or 'ignore'.
        """
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling " + errors)
        return punycode_decode(input, errors), len(input)
209
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that encodes each input chunk independently."""

    def encode(self, input, final=False):
        """Return the Punycode encoding of input; final is not used."""
        encoded = punycode_encode(input)
        return encoded
213
class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder that decodes each input chunk independently."""

    def decode(self, input, final=False):
        """Return the decoded string for input; final is not used.

        Raises UnicodeError if the decoder's errors mode is not one of
        'strict', 'replace' or 'ignore'.
        """
        mode = self.errors
        if mode not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling " + mode)
        return punycode_decode(input, mode)
Thomas Woutersa9773292006-04-21 09:43:23 +0000219
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer that uses Codec.encode for Punycode output."""
222
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader that uses Codec.decode for Punycode input."""
225
226### encodings module API
227
def getregentry():
    """Return the codecs.CodecInfo describing the 'punycode' codec."""
    # Separate Codec instances back the stateless encode and decode entries.
    encoder = Codec().encode
    decoder = Codec().decode
    return codecs.CodecInfo(
        name='punycode',
        encode=encoder,
        decode=decoder,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )