blob: 82fd4585f57d03cd71a1483cd3c38d21fd7ce516 [file] [log] [blame]
Martin v. Löwis2548c732003-04-18 10:39:54 +00001# -*- coding: iso-8859-1 -*-
""" Codec for the Punycode encoding, as specified in RFC 3492
3
4Written by Martin v. Löwis.
5"""
6
7import codecs
8
9##################### Encoding #####################################
10
def segregate(str):
    """3.1 Basic code point segregation

    Split ``str`` into its basic (ordinal < 128) and extended code points.

    Returns a pair ``(base, extended)``: ``base`` is the basic characters
    in their original order, encoded as ASCII; ``extended`` is a sorted
    list of the distinct non-basic characters.
    """
    base = []
    extended = set()
    for c in str:
        if ord(c) < 128:
            base.append(c)
        else:
            # A set removes duplicates; order is restored by sorted() below.
            extended.add(c)
    # sorted() always yields a fresh list, so this does not depend on
    # dict.keys() returning a sortable list.
    return "".join(base).encode("ascii"), sorted(extended)
23
def selective_len(str, max):
    """Count the characters of str whose ordinal is strictly below max."""
    return sum(1 for ch in str if ord(ch) < max)
31
def selective_find(str, char, index, pos):
    """Locate the next occurrence of char in str after position pos.

    The scan starts at pos + 1.  index is a running count that is
    advanced once for every scanned character that compares lower than
    char.  On a match the pair (index + 1, pos) is returned, pos being
    the position of the match in the full string.  When the end of the
    string is reached without a match, (-1, -1) is returned."""
    length = len(str)
    pos += 1
    while pos != length:
        current = str[pos]
        if current == char:
            return index + 1, pos
        if current < char:
            # Only characters ordered before char count towards index.
            index += 1
        pos += 1
    return (-1, -1)
49
def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding

    For every character in extended (taken in the given order), emit one
    delta per occurrence of that character in str.  Returns the list of
    deltas."""
    deltas = []
    prev_code = 0x80
    prev_index = -1
    for ch in extended:
        code = ord(ch)
        index = pos = -1
        # Moving on to a higher code point costs one full sweep over the
        # characters below it (plus one) for each code point skipped.
        delta = (selective_len(str, code) + 1) * (code - prev_code)
        index, pos = selective_find(str, ch, index, pos)
        while index != -1:
            delta += index - prev_index
            deltas.append(delta - 1)
            prev_index = index
            delta = 0
            index, pos = selective_find(str, ch, index, pos)
        prev_code = code
    return deltas
71
def T(j, bias):
    """Return the threshold for digit position j, clamped to [1, 26]."""
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    return min(max(36 * (j + 1) - bias, 1), 26)
78
# Digit alphabet: values 0-25 map to a-z, values 26-35 map to 0-9.
digits = "abcdefghijklmnopqrstuvwxyz0123456789"
def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers

    Encode the integer N as a list of characters drawn from digits,
    using the per-position thresholds given by T(position, bias)."""
    encoded = []
    position = 0
    threshold = T(position, bias)
    while N >= threshold:
        # Emit one non-final digit and carry the quotient forward.
        N, remainder = divmod(N - threshold, 36 - threshold)
        encoded.append(digits[threshold + remainder])
        position += 1
        threshold = T(position, bias)
    # A digit below the threshold terminates the number.
    encoded.append(digits[N])
    return encoded
92
def adapt(delta, first, numchars):
    """Compute the bias to use for the next delta.

    delta is the delta just processed, first is true when it was the
    first one, and numchars is the divisor applied to the scaled delta.
    """
    # damp = 700 for the very first delta, otherwise halve it.
    delta = delta // (700 if first else 2)
    delta += delta // numchars
    bias = 0
    # ((base - tmin) * tmax) // 2 == 455
    while delta > 455:
        delta //= 35  # base - tmin
        bias += 36
    return bias + (36 * delta) // (delta + 38)
105 return bias
Tim Peters0eadaac2003-04-24 16:02:54 +0000106
Martin v. Löwis2548c732003-04-18 10:39:54 +0000107
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Encode every delta as a generalized variable-length integer,
    re-adapting the bias after each one, and return the concatenated
    digits as a single string."""
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    pieces = []
    bias = 72
    handled = 0
    for delta in deltas:
        pieces.extend(generate_generalized_integer(delta, bias))
        handled += 1
        # handled == 1 marks the first delta; baselen + handled is the
        # number of code points accounted for so far.
        bias = adapt(delta, handled == 1, baselen + handled)
    return "".join(pieces)
118
def punycode_encode(text):
    """Encode text with Punycode.

    The result is the basic code points of text, followed by "-" and
    the encoded deltas for the extended code points.  When text has no
    basic code points, only the encoded deltas are returned.
    """
    # segregate() already returns the basic code points ASCII-encoded,
    # so they must not be encoded a second time here.
    base, extended = segregate(text)
    deltas = insertion_unsort(text, extended)
    extended = generate_integers(len(base), deltas)
    if base:
        return base + "-" + extended
    return extended
127
128##################### Decoding #####################################
129
def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one generalized variable-length integer from extended,
    starting at index extpos.  Only the digits A-Z and 0-9 are accepted.

    Returns a pair (newpos, value), where newpos is the index just past
    the consumed input.  When the input is truncated or contains an
    invalid digit and errors is not "strict", value is None.

    Raises UnicodeError for truncated or invalid input when errors is
    "strict".
    """
    result = 0
    w = 1
    j = 0
    while True:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punycode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A:  # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:  # 0-9
            digit = char - 22  # 0x30-26
        elif errors == "strict":
            # extpos was already advanced, so the offending character is
            # the one just before it.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos - 1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            # A digit below the threshold terminates the number.
            return extpos, result
        w = w * (36 - t)
        j += 1
Tim Peters0eadaac2003-04-24 16:02:54 +0000158
Martin v. Löwis2548c732003-04-18 10:39:54 +0000159
def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding

    Decode the deltas in extended one at a time and insert the
    resulting characters into base.  Returns the resulting string.  If a
    delta cannot be decoded (non-strict mode), whatever has been built
    so far is returned.

    Raises UnicodeError in strict mode when a decoded code point exceeds
    U+10FFFF."""
    code = 0x80
    cursor = -1
    bias = 72
    offset = 0
    total = len(extended)
    while offset < total:
        next_offset, delta = decode_generalized_number(extended, offset,
                                                       bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        cursor += delta + 1
        # Every full pass over the len(base) + 1 insertion slots bumps
        # the code point by one; the remainder is the insertion position.
        wraps, cursor = divmod(cursor, len(base) + 1)
        code += wraps
        if code > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % code)
            code = ord('?')
        base = base[:cursor] + unichr(code) + base[cursor:]
        bias = adapt(delta, offset == 0, len(base))
        offset = next_offset
    return base
184
def punycode_decode(text, errors):
    """Decode the Punycode string text.

    Everything before the last "-" is the basic part; everything after
    it is the encoded extended part.  Without any "-", the whole text is
    the extended part and the basic part is empty."""
    # rpartition yields ("", "", text) when there is no delimiter.
    base, _, extended = text.rpartition("-")
    base = unicode(base, "ascii", errors)
    # The digit decoder only recognises upper-case letters.
    extended = extended.upper()
    return insertion_sort(base, extended, errors)
Tim Peters0eadaac2003-04-24 16:02:54 +0000196
Martin v. Löwis2548c732003-04-18 10:39:54 +0000197### Codec APIs
198
class Codec(codecs.Codec):
    """Stateless Punycode encoder/decoder."""

    def encode(self, input, errors='strict'):
        """Return (encoded input, number of input items consumed).

        errors is accepted for interface compatibility and is not used.
        """
        return punycode_encode(input), len(input)

    def decode(self, input, errors='strict'):
        """Return (decoded input, number of input items consumed).

        Raises UnicodeError when errors is not one of 'strict',
        'replace' or 'ignore'.
        """
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling " + errors)
        return punycode_decode(input, errors), len(input)
211
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer that uses Codec for encoding."""
214
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader that uses Codec for decoding."""
217
218### encodings module API
219
def getregentry():
    """Return the (encoder, decoder, stream reader, stream writer) tuple."""
    # The encoder and decoder are bound to two separate Codec instances.
    encoder = Codec().encode
    decoder = Codec().decode
    return (encoder, decoder, StreamReader, StreamWriter)