blob: 89906ae2827a973b35418991f4f3e9f35f4bc370 [file] [log] [blame]
# -*- coding: iso-8859-1 -*-
""" Codec for the Punycode encoding, as specified in RFC 3492

Written by Martin v. Löwis.
"""
6
7import codecs
8
9##################### Encoding #####################################
10
def segregate(str):
    """3.1 Basic code point segregation.

    Split *str* into its basic (ASCII) and extended (non-ASCII) parts.

    Returns a pair ``(base, extended)``: *base* holds the characters of
    *str* whose code point is below 128, in their original order and
    encoded as ASCII; *extended* is the sorted list of the distinct
    remaining characters.
    """
    ascii_chars = [ch for ch in str if ord(ch) < 128]
    # A set collapses repeated extended characters; each is listed once.
    non_ascii = {ch for ch in str if ord(ch) >= 128}
    return "".join(ascii_chars).encode("ascii"), sorted(non_ascii)
22
def selective_len(str, max):
    """Count the characters of *str* whose code point is strictly below *max*."""
    return sum(1 for ch in str if ord(ch) < max)
30
def selective_find(str, char, index, pos):
    """Locate the next occurrence of *char* in *str* after position *pos*.

    The scan starts at ``pos + 1``.  *index* is a running counter that is
    advanced by one for every scanned character that compares below
    *char*, and once more for the occurrence that is found.

    Returns the pair ``(index, pos)`` for the occurrence found, or
    ``(-1, -1)`` when the end of *str* is reached without a match.
    """
    end = len(str)
    pos += 1
    while pos != end:
        current = str[pos]
        if current == char:
            return index + 1, pos
        if current < char:
            index += 1
        pos += 1
    return (-1, -1)
48
def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding.

    *str* is the full text and *extended* the sorted list of its distinct
    non-ASCII characters.  Returns a list with one integer delta per
    occurrence of an extended character, visiting the characters in the
    order given by *extended* and, for each, its occurrences from left
    to right.
    """
    deltas = []
    prev_char = 0x80
    # Deliberately not reset between characters: it carries over from the
    # last occurrence of the previous extended character.
    prev_index = -1
    for c in extended:
        code = ord(c)
        # Start from (count of smaller code points + 1) times the gap to
        # the previously handled code point.
        delta = (selective_len(str, code) + 1) * (code - prev_char)
        index, pos = selective_find(str, c, -1, -1)
        while index != -1:
            delta += index - prev_index
            deltas.append(delta - 1)
            prev_index = index
            delta = 0
            index, pos = selective_find(str, c, index, pos)
        prev_char = code

    return deltas
70
def T(j, bias):
    """Return ``36 * (j + 1) - bias`` clamped to the range 1..26."""
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    threshold = 36 * (j + 1) - bias
    return max(min(threshold, 26), 1)
77
# Digit alphabet: values 0-25 map to 'a'-'z', values 26-35 to '0'-'9'.
digits = "abcdefghijklmnopqrstuvwxyz0123456789"
def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers.

    Encode the integer *N* as a list of one-character strings taken from
    ``digits``.  The threshold for each digit position comes from
    ``T(position, bias)``; the digit that falls below its threshold ends
    the number.
    """
    encoded = []
    j = 0
    t = T(j, bias)
    while N >= t:
        # Emit the low-order digit for this position, carry the rest on.
        N, remainder = divmod(N - t, 36 - t)
        encoded.append(digits[t + remainder])
        j += 1
        t = T(j, bias)
    encoded.append(digits[N])
    return encoded
91
def adapt(delta, first, numchars):
    """Compute the bias to use for the next delta.

    *delta* is the delta just processed, *first* is true when it was the
    first delta of the string (it is then scaled down by 700 rather than
    by 2), and *numchars* is the divisor applied to the scaled delta.
    Returns the new bias as an integer.
    """
    delta //= 700 if first else 2
    delta += delta // numchars
    # ((base - tmin) * tmax) // 2 == 455
    divisions = 0
    while delta > 455:
        delta //= 35  # base - tmin
        divisions += 36
    return divisions + (36 * delta) // (delta + 38)
Tim Peters0eadaac2003-04-24 16:02:54 +0000105
Martin v. Löwis2548c732003-04-18 10:39:54 +0000106
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation.

    Encode every delta in *deltas* as a generalized variable-length
    integer, re-computing the bias after each one.  *baselen* is the
    number of basic characters already present.  Returns the
    concatenated digits as a single string.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    pieces = []
    bias = 72
    for position, delta in enumerate(deltas):
        pieces += generate_generalized_integer(delta, bias)
        bias = adapt(delta, position == 0, baselen + position + 1)
    return "".join(pieces)
117
def punycode_encode(text):
    """Encode the string *text* as Punycode and return the result as bytes.

    The output is the basic (ASCII) characters of *text*, followed by a
    ``-`` delimiter and the encoded insertion deltas.  When *text* has no
    basic characters, only the encoded deltas are returned.
    """
    # segregate() already returns the basic characters ASCII-encoded, so
    # they must not be encoded a second time.
    base, extended = segregate(text)
    deltas = insertion_unsort(text, extended)
    # generate_integers() yields text; convert it so the pieces joined
    # below all have the same (bytes) type.
    extended = generate_integers(len(base), deltas).encode("ascii")
    if base:
        return base + b"-" + extended
    return extended
126
127##################### Decoding #####################################
128
def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers.

    Decode one integer from the string *extended*, starting at index
    *extpos*.  Only the digits ``A``-``Z`` (values 0-25) and ``0``-``9``
    (values 26-35) are recognised, so the caller is expected to pass
    upper-cased input.

    Returns ``(next_position, value)``.  When the input ends early or
    holds an invalid digit, a ``UnicodeError`` is raised if *errors* is
    ``"strict"``; otherwise ``(next_position, None)`` is returned.
    """
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punicode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            # extpos was already advanced, so the offending character is
            # the one just before it.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos - 1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            # A digit below the threshold terminates the number.
            return extpos, result
        w = w * (36 - t)
        j += 1
Tim Peters0eadaac2003-04-24 16:02:54 +0000157
Martin v. Löwis2548c732003-04-18 10:39:54 +0000158
def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding (decoding direction).

    *base* is the string of basic characters and *extended* the string of
    encoded deltas.  Each decoded delta selects a code point and an
    insertion position; the character is inserted into *base*.

    Returns the resulting string.  If a delta cannot be decoded in a
    non-strict mode, the string built so far is returned.  A code point
    above U+10FFFF raises ``UnicodeError`` when *errors* is ``"strict"``
    and is replaced by ``'?'`` otherwise.
    """
    char = 0x80
    pos = -1
    bias = 72
    extpos = 0
    while extpos < len(extended):
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        pos += delta+1
        # Every full pass over the len(base) + 1 insertion slots moves on
        # to the next code point.
        char += pos // (len(base) + 1)
        if char > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % char)
            char = ord('?')
        pos = pos % (len(base) + 1)
        base = base[:pos] + chr(char) + base[pos:]
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base
183
def punycode_decode(text, errors):
    """Decode the Punycode data *text* and return the resulting string.

    *text* may be bytes, another bytes-like object, or an ASCII-only
    string.  Everything before the last ``-`` is the basic part; the rest
    holds the encoded deltas.  Input without a ``-`` is treated as deltas
    only.  *errors* is the error handling scheme, used both for the ASCII
    decoding and for the delta decoding.
    """
    # Normalise to bytes so the delimiter search and the ASCII decoding
    # below work for every accepted input type.
    if isinstance(text, str):
        text = text.encode("ascii")
    elif not isinstance(text, bytes):
        text = bytes(text)
    pos = text.rfind(b"-")
    if pos == -1:
        base = ""
        extended = text
    else:
        base = str(text[:pos], "ascii", errors)
        extended = text[pos+1:]
    # The digit decoder works on characters and only knows upper case.
    extended = str(extended, "ascii", errors).upper()
    return insertion_sort(base, extended, errors)
Tim Peters0eadaac2003-04-24 16:02:54 +0000195
Martin v. Löwis2548c732003-04-18 10:39:54 +0000196### Codec APIs
197
class Codec(codecs.Codec):
    """Stateless Punycode encoder/decoder."""

    def encode(self, input, errors='strict'):
        """Encode *input* with ``punycode_encode``.

        *errors* is accepted for interface compatibility but is not used.
        Returns ``(encoded, len(input))``.
        """
        res = punycode_encode(input)
        return res, len(input)

    def decode(self, input, errors='strict'):
        """Decode *input* with ``punycode_decode``.

        *errors* must be ``'strict'``, ``'replace'`` or ``'ignore'``; any
        other value raises ``UnicodeError``.
        Returns ``(decoded, len(input))``.
        """
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling " + errors)
        res = punycode_decode(input, errors)
        return res, len(input)
209
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder interface for Punycode."""

    def encode(self, input, final=False):
        # ``final`` is not consulted: each call encodes ``input`` on its own.
        encoded = punycode_encode(input)
        return encoded
213
class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder interface for Punycode."""

    def decode(self, input, final=False):
        """Decode *input* with ``punycode_decode`` using ``self.errors``.

        ``final`` is not consulted: each call decodes ``input`` on its own.
        Raises ``UnicodeError`` if ``self.errors`` is not ``'strict'``,
        ``'replace'`` or ``'ignore'``.
        """
        if self.errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling " + self.errors)
        return punycode_decode(input, self.errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000219
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer combining ``Codec`` with ``codecs.StreamWriter``."""
222
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader combining ``Codec`` with ``codecs.StreamReader``."""
225
226### encodings module API
227
def getregentry():
    """Return the ``codecs.CodecInfo`` describing the punycode codec."""
    encoder = Codec().encode
    decoder = Codec().decode
    return codecs.CodecInfo(
        name='punycode',
        encode=encoder,
        decode=decoder,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )