blob: 4c22fe5166e1a769f2877d671fcf52321789fd0e [file] [log] [blame]
Martin v. Löwis2548c732003-04-18 10:39:54 +00001# -*- coding: iso-8859-1 -*-
""" Codec for the Punycode encoding, as specified in RFC 3492

Written by Martin v. Löwis.
"""
6
7import codecs
8
9##################### Encoding #####################################
10
def segregate(str):
    """3.1 Basic code point segregation

    Split *str* into its ASCII and non-ASCII parts.

    Returns a pair ``(base, extended)``: ``base`` is a ``bytes`` object
    holding every code point below 128 in original order; ``extended`` is
    a sorted list of the distinct characters with code point >= 128.
    """
    # bytes objects are immutable, so accumulate in a bytearray.
    base = bytearray()
    extended = set()
    for c in str:
        if ord(c) < 128:
            base.append(ord(c))
        else:
            extended.add(c)
    return bytes(base), sorted(extended)
Martin v. Löwis2548c732003-04-18 10:39:54 +000022
def selective_len(str, max):
    """Return the length of str, considering only characters below max.

    *max* is an integer code point; a character counts when
    ``ord(character) < max``.
    """
    return sum(1 for c in str if ord(c) < max)
30
def selective_find(str, char, index, pos):
    """Find the next occurrence of char in str after position pos.

    Scanning starts at ``pos + 1``.  While scanning, *index* is advanced
    once for every character that compares lower than *char*.

    Returns ``(index + 1, pos)`` for the first match, where ``pos`` is the
    match position in the full string, or ``(-1, -1)`` when *char* does
    not occur in the rest of the string.  Pass the returned pair back in
    to continue the search.
    """
    for pos in range(pos + 1, len(str)):
        c = str[pos]
        if c == char:
            return index + 1, pos
        if c < char:
            index += 1
    return -1, -1
48
def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding

    *str* is the full text and *extended* the sorted non-ASCII characters
    to encode.  Returns a list of integer deltas, one for each occurrence
    of each character of *extended* in *str*.
    """
    oldchar = 0x80
    result = []
    oldindex = -1
    for c in extended:
        index = pos = -1
        char = ord(c)
        curlen = selective_len(str, char)
        # Skip over all the states between the previous code point and
        # this one.
        delta = (curlen + 1) * (char - oldchar)
        while True:
            index, pos = selective_find(str, c, index, pos)
            if index == -1:
                break
            delta += index - oldindex
            result.append(delta - 1)
            oldindex = index
            delta = 0
        oldchar = char

    return result
70
def T(j, bias):
    """Return the threshold for digit position j given the current bias.

    The value ``base * (j + 1) - bias`` is clamped to ``[tmin, tmax]``.
    """
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    res = 36 * (j + 1) - bias
    return max(1, min(26, res))
77
# Digit alphabet: values 0-25 map to a-z, values 26-35 map to 0-9.
digits = b"abcdefghijklmnopqrstuvwxyz0123456789"


def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers

    Encode the non-negative integer *N* under *bias* and return the
    resulting digits as a ``bytes`` object.
    """
    # bytes objects are immutable, so accumulate in a bytearray.
    result = bytearray()
    j = 0
    while True:
        t = T(j, bias)
        if N < t:
            # Final digit: a value below the threshold ends the number.
            result.append(digits[N])
            return bytes(result)
        result.append(digits[t + ((N - t) % (36 - t))])
        N = (N - t) // (36 - t)
        j += 1
91
def adapt(delta, first, numchars):
    """Return the bias to use after *delta* has been processed.

    delta    -- the delta that was just encoded or decoded
    first    -- true for the very first delta, which is divided by 700
                (damp) instead of 2
    numchars -- divisor for the proportional increase of delta; callers
                pass the number of code points handled so far
    """
    if first:
        delta //= 700
    else:
        delta //= 2
    delta += delta // numchars
    # ((base - tmin) * tmax) // 2 == 455
    divisions = 0
    while delta > 455:
        delta = delta // 35  # base - tmin
        divisions += 36
    # 38 is the skew parameter.
    bias = divisions + (36 * delta // (delta + 38))
    return bias
Tim Peters0eadaac2003-04-24 16:02:54 +0000105
Martin v. Löwis2548c732003-04-18 10:39:54 +0000106
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Encode every delta in *deltas* as a generalized variable-length
    integer, re-adapting the bias after each one, and return the
    concatenated digits as ``bytes``.  *baselen* is the number of basic
    (ASCII) code points in the text being encoded.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    # bytes objects are immutable, so accumulate in a bytearray.
    result = bytearray()
    bias = 72
    for points, delta in enumerate(deltas):
        s = generate_generalized_integer(delta, bias)
        result.extend(s)
        bias = adapt(delta, points == 0, baselen + points + 1)
    return bytes(result)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000117
def punycode_encode(text):
    """Encode the string *text* and return the Punycode form.

    The result is the basic (ASCII) code points, a ``-`` delimiter and
    the encoded deltas; when there are no basic code points only the
    encoded deltas are returned.
    """
    base, extended = segregate(text)
    deltas = insertion_unsort(text, extended)
    extended = generate_integers(len(base), deltas)
    if base:
        return base + b"-" + extended
    return extended
125
126##################### Decoding #####################################
127
def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one integer from *extended*, starting at index *extpos*.
    Only ``A``-``Z`` (values 0-25) and ``0``-``9`` (values 26-35) are
    accepted as digits, so the caller must upper-case the input first.

    Returns ``(new_extpos, value)``.  When the input ends early or holds
    an invalid digit, UnicodeError is raised if *errors* is ``"strict"``;
    otherwise ``(new_extpos, None)`` is returned.
    """
    result = 0
    w = 1
    j = 0
    while True:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punycode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A:  # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22  # 0x30-26
        elif errors == "strict":
            # extpos was already advanced, so the offending character
            # sits one position back.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos - 1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            return extpos, result
        w = w * (36 - t)
        j += 1
Tim Peters0eadaac2003-04-24 16:02:54 +0000156
Martin v. Löwis2548c732003-04-18 10:39:54 +0000157
def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding

    Decode the deltas in *extended* and insert the resulting characters
    into the string *base*.  Returns the completed string.

    A code point above U+10FFFF raises UnicodeError when *errors* is
    ``"strict"``; otherwise ``?`` is inserted in its place.  If a delta
    cannot be decoded, the string built so far is returned.
    """
    char = 0x80
    pos = -1
    bias = 72
    extpos = 0
    while extpos < len(extended):
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        pos += delta + 1
        char += pos // (len(base) + 1)
        if char > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % char)
            char = ord('?')
        pos = pos % (len(base) + 1)
        base = base[:pos] + chr(char) + base[pos:]
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base
182
def punycode_decode(text, errors):
    """Decode Punycode *text* (str or bytes-like) and return a string.

    The part before the last ``-`` holds the basic code points and is
    decoded as ASCII using *errors*.  The part after it is upper-cased
    and handed to insertion_sort() as the encoded deltas.  With no ``-``
    present the whole input is treated as encoded deltas.
    """
    if isinstance(text, str):
        text = text.encode("ascii")
    if isinstance(text, memoryview):
        # memoryview has no rfind(); work on a bytes copy instead.
        text = bytes(text)
    pos = text.rfind(b"-")
    if pos == -1:
        base = ""
        extended = str(text, "ascii").upper()
    else:
        base = str(text[:pos], "ascii", errors)
        extended = str(text[pos+1:], "ascii").upper()
    return insertion_sort(base, extended, errors)
Tim Peters0eadaac2003-04-24 16:02:54 +0000194
Martin v. Löwis2548c732003-04-18 10:39:54 +0000195### Codec APIs
196
class Codec(codecs.Codec):
    """Stateless Punycode encoder/decoder."""

    def encode(self, input, errors='strict'):
        """Return ``(encoded_bytes, number_of_characters_consumed)``.

        *errors* is accepted for interface compatibility but not used.
        """
        res = punycode_encode(input)
        return res, len(input)

    def decode(self, input, errors='strict'):
        """Return ``(decoded_string, length_of_input_consumed)``.

        Raises UnicodeError if *errors* is not one of 'strict',
        'replace' or 'ignore'.
        """
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+errors)
        res = punycode_decode(input, errors)
        return res, len(input)
208
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder; each call encodes *input* on its own."""

    def encode(self, input, final=False):
        # No state is kept between calls; *final* is not used.
        return punycode_encode(input)
212
class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder; each call decodes *input* on its own."""

    def decode(self, input, final=False):
        # No state is kept between calls; *final* is not used.
        if self.errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+self.errors)
        return punycode_decode(input, self.errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000218
class StreamWriter(Codec,codecs.StreamWriter):
    # Combines Codec with codecs.StreamWriter; adds nothing of its own.
    pass
221
class StreamReader(Codec,codecs.StreamReader):
    # Combines Codec with codecs.StreamReader; adds nothing of its own.
    pass
224
225### encodings module API
226
def getregentry():
    """Return the codecs.CodecInfo entry describing the punycode codec."""
    return codecs.CodecInfo(
        name='punycode',
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
236 )