blob: 366011c7a8503ca16a6fa94ec04305605ef4ec43 [file] [log] [blame]
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01001/* stringlib: codec implementations */
2
3#if STRINGLIB_IS_UNICODE
4
5/* Mask to check or force alignment of a pointer to C 'long' boundaries */
6#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
7
8/* Mask to quickly check whether a C 'long' contains a
9 non-ASCII, UTF8-encoded char. */
10#if (SIZEOF_LONG == 8)
11# define ASCII_CHAR_MASK 0x8080808080808080L
12#elif (SIZEOF_LONG == 4)
13# define ASCII_CHAR_MASK 0x80808080L
14#else
15# error C 'long' size should be either 4 or 8!
16#endif
17
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020018Py_LOCAL_INLINE(Py_UCS4)
19STRINGLIB(utf8_decode)(const char **inptr, const char *end,
20 STRINGLIB_CHAR *dest,
21 Py_ssize_t *outpos)
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010022{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020023 Py_UCS4 ch;
24 const char *s = *inptr;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010025 const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020026 STRINGLIB_CHAR *p = dest + *outpos;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010027
28 while (s < end) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020029 ch = (unsigned char)*s;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010030
31 if (ch < 0x80) {
32 /* Fast path for runs of ASCII characters. Given that common UTF-8
33 input will consist of an overwhelming majority of ASCII
34 characters, we try to optimize for this case by checking
35 as many characters as a C 'long' can contain.
36 First, check if we can do an aligned read, as most CPUs have
37 a penalty for unaligned reads.
38 */
39 if (!((size_t) s & LONG_PTR_MASK)) {
40 /* Help register allocation */
41 register const char *_s = s;
42 register STRINGLIB_CHAR *_p = p;
43 while (_s < aligned_end) {
44 /* Read a whole long at a time (either 4 or 8 bytes),
45 and do a fast unrolled copy if it only contains ASCII
46 characters. */
47 unsigned long value = *(unsigned long *) _s;
48 if (value & ASCII_CHAR_MASK)
49 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020050#ifdef BYTEORDER_IS_LITTLE_ENDIAN
51 _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
52 _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
53 _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
54 _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
55# if SIZEOF_LONG == 8
56 _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
57 _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
58 _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
59 _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
60# endif
61#else
62# if SIZEOF_LONG == 8
63 _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
64 _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
65 _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
66 _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
67 _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
68 _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
69 _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
70 _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
71# else
72 _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
73 _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
74 _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
75 _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
76# endif
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010077#endif
78 _s += SIZEOF_LONG;
79 _p += SIZEOF_LONG;
80 }
81 s = _s;
82 p = _p;
83 if (s == end)
84 break;
85 ch = (unsigned char)*s;
86 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020087 if (ch < 0x80) {
88 s++;
89 *p++ = ch;
90 continue;
91 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010092 }
93
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020094 if (ch < 0xC2) {
95 /* invalid sequence
96 \x80-\xBF -- continuation byte
97 \xC0-\xC1 -- fake 0000-007F */
98 goto InvalidStart;
99 }
100
101 if (ch < 0xE0) {
102 /* \xC2\x80-\xDF\xBF -- 0080-07FF */
103 Py_UCS4 ch2;
104 if (end - s < 2) {
105 /* unexpected end of data: the caller will decide whether
106 it's an error or not */
107 break;
108 }
109 ch2 = (unsigned char)s[1];
110 if ((ch2 & 0xC0) != 0x80)
111 /* invalid continuation byte */
112 goto InvalidContinuation;
113 ch = (ch << 6) + ch2 -
114 ((0xC0 << 6) + 0x80);
115 assert ((ch > 0x007F) && (ch <= 0x07FF));
116 s += 2;
117 if (STRINGLIB_MAX_CHAR <= 0x007F ||
118 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
119 goto Overflow;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100120 *p++ = ch;
121 continue;
122 }
123
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200124 if (ch < 0xF0) {
125 /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
126 Py_UCS4 ch2, ch3;
127 if (end - s < 3) {
128 /* unexpected end of data: the caller will decide whether
129 it's an error or not */
130 break;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100131 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200132 ch2 = (unsigned char)s[1];
133 ch3 = (unsigned char)s[2];
134 if ((ch2 & 0xC0) != 0x80 ||
135 (ch3 & 0xC0) != 0x80) {
136 /* invalid continuation byte */
137 goto InvalidContinuation;
138 }
139 if (ch == 0xE0) {
140 if (ch2 < 0xA0)
141 /* invalid sequence
142 \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
143 goto InvalidContinuation;
144 }
145 else if (ch == 0xED && ch2 > 0x9F) {
146 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
147 will result in surrogates in range D800-DFFF. Surrogates are
148 not valid UTF-8 so they are rejected.
149 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
150 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
151 goto InvalidContinuation;
152 }
153 ch = (ch << 12) + (ch2 << 6) + ch3 -
154 ((0xE0 << 12) + (0x80 << 6) + 0x80);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100155 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
156 s += 3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200157 if (STRINGLIB_MAX_CHAR <= 0x07FF ||
158 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
159 goto Overflow;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100160 *p++ = ch;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200161 continue;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100162 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200163
164 if (ch < 0xF5) {
165 /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
166 Py_UCS4 ch2, ch3, ch4;
167 if (end - s < 4) {
168 /* unexpected end of data: the caller will decide whether
169 it's an error or not */
170 break;
171 }
172 ch2 = (unsigned char)s[1];
173 ch3 = (unsigned char)s[2];
174 ch4 = (unsigned char)s[3];
175 if ((ch2 & 0xC0) != 0x80 ||
176 (ch3 & 0xC0) != 0x80 ||
177 (ch4 & 0xC0) != 0x80) {
178 /* invalid continuation byte */
179 goto InvalidContinuation;
180 }
181 if (ch == 0xF0) {
182 if (ch2 < 0x90)
183 /* invalid sequence
184 \xF0\x80\x80\x80-\xF0\x80\xBF\xBF -- fake 0000-FFFF */
185 goto InvalidContinuation;
186 }
187 else if (ch == 0xF4 && ch2 > 0x8F) {
188 /* invalid sequence
189 \xF4\x90\x80\80- -- 110000- overflow */
190 goto InvalidContinuation;
191 }
192 ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
193 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
194 assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
195 s += 4;
196 if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
197 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
198 goto Overflow;
199 *p++ = ch;
200 continue;
201 }
202 goto InvalidStart;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100203 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200204 ch = 0;
205Overflow:
206Return:
207 *inptr = s;
208 *outpos = p - dest;
209 return ch;
210InvalidStart:
211 ch = 1;
212 goto Return;
213InvalidContinuation:
214 ch = 2;
215 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100216}
217
218#undef LONG_PTR_MASK
219#undef ASCII_CHAR_MASK
220
Victor Stinner6099a032011-12-18 14:22:26 +0100221
222/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
223 PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
224 UCS-1 strings don't need to handle surrogates for example. */
225Py_LOCAL_INLINE(PyObject *)
226STRINGLIB(utf8_encoder)(PyObject *unicode,
227 STRINGLIB_CHAR *data,
228 Py_ssize_t size,
229 const char *errors)
230{
231#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
232
233 Py_ssize_t i; /* index into s of next input byte */
234 PyObject *result; /* result string object */
235 char *p; /* next free byte in output buffer */
236 Py_ssize_t nallocated; /* number of result bytes allocated */
237 Py_ssize_t nneeded; /* number of result bytes needed */
238#if STRINGLIB_SIZEOF_CHAR > 1
239 PyObject *errorHandler = NULL;
240 PyObject *exc = NULL;
241 PyObject *rep = NULL;
242#endif
243#if STRINGLIB_SIZEOF_CHAR == 1
244 const Py_ssize_t max_char_size = 2;
245 char stackbuf[MAX_SHORT_UNICHARS * 2];
246#elif STRINGLIB_SIZEOF_CHAR == 2
247 const Py_ssize_t max_char_size = 3;
248 char stackbuf[MAX_SHORT_UNICHARS * 3];
249#else /* STRINGLIB_SIZEOF_CHAR == 4 */
250 const Py_ssize_t max_char_size = 4;
251 char stackbuf[MAX_SHORT_UNICHARS * 4];
252#endif
253
254 assert(size >= 0);
255
256 if (size <= MAX_SHORT_UNICHARS) {
257 /* Write into the stack buffer; nallocated can't overflow.
258 * At the end, we'll allocate exactly as much heap space as it
259 * turns out we need.
260 */
261 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
262 result = NULL; /* will allocate after we're done */
263 p = stackbuf;
264 }
265 else {
266 if (size > PY_SSIZE_T_MAX / max_char_size) {
267 /* integer overflow */
268 return PyErr_NoMemory();
269 }
270 /* Overallocate on the heap, and give the excess back at the end. */
271 nallocated = size * max_char_size;
272 result = PyBytes_FromStringAndSize(NULL, nallocated);
273 if (result == NULL)
274 return NULL;
275 p = PyBytes_AS_STRING(result);
276 }
277
278 for (i = 0; i < size;) {
279 Py_UCS4 ch = data[i++];
280
281 if (ch < 0x80) {
282 /* Encode ASCII */
283 *p++ = (char) ch;
284
285 }
286 else
287#if STRINGLIB_SIZEOF_CHAR > 1
288 if (ch < 0x0800)
289#endif
290 {
291 /* Encode Latin-1 */
292 *p++ = (char)(0xc0 | (ch >> 6));
293 *p++ = (char)(0x80 | (ch & 0x3f));
294 }
295#if STRINGLIB_SIZEOF_CHAR > 1
296 else if (Py_UNICODE_IS_SURROGATE(ch)) {
297 Py_ssize_t newpos;
298 Py_ssize_t repsize, k, startpos;
299 startpos = i-1;
300 rep = unicode_encode_call_errorhandler(
301 errors, &errorHandler, "utf-8", "surrogates not allowed",
302 unicode, &exc, startpos, startpos+1, &newpos);
303 if (!rep)
304 goto error;
305
306 if (PyBytes_Check(rep))
307 repsize = PyBytes_GET_SIZE(rep);
308 else
309 repsize = PyUnicode_GET_LENGTH(rep);
310
311 if (repsize > max_char_size) {
312 Py_ssize_t offset;
313
314 if (result == NULL)
315 offset = p - stackbuf;
316 else
317 offset = p - PyBytes_AS_STRING(result);
318
319 if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
320 /* integer overflow */
321 PyErr_NoMemory();
322 goto error;
323 }
324 nallocated += repsize - max_char_size;
325 if (result != NULL) {
326 if (_PyBytes_Resize(&result, nallocated) < 0)
327 goto error;
328 } else {
329 result = PyBytes_FromStringAndSize(NULL, nallocated);
330 if (result == NULL)
331 goto error;
332 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
333 }
334 p = PyBytes_AS_STRING(result) + offset;
335 }
336
337 if (PyBytes_Check(rep)) {
338 char *prep = PyBytes_AS_STRING(rep);
339 for(k = repsize; k > 0; k--)
340 *p++ = *prep++;
341 } else /* rep is unicode */ {
342 enum PyUnicode_Kind repkind;
343 void *repdata;
344
345 if (PyUnicode_READY(rep) < 0)
346 goto error;
347 repkind = PyUnicode_KIND(rep);
348 repdata = PyUnicode_DATA(rep);
349
350 for(k=0; k<repsize; k++) {
351 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
352 if (0x80 <= c) {
353 raise_encode_exception(&exc, "utf-8",
354 unicode,
355 i-1, i,
356 "surrogates not allowed");
357 goto error;
358 }
359 *p++ = (char)c;
360 }
361 }
362 Py_CLEAR(rep);
363 }
364 else
365#if STRINGLIB_SIZEOF_CHAR > 2
366 if (ch < 0x10000)
367#endif
368 {
369 *p++ = (char)(0xe0 | (ch >> 12));
370 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
371 *p++ = (char)(0x80 | (ch & 0x3f));
372 }
373#if STRINGLIB_SIZEOF_CHAR > 2
374 else /* ch >= 0x10000 */
375 {
376 assert(ch <= MAX_UNICODE);
377 /* Encode UCS4 Unicode ordinals */
378 *p++ = (char)(0xf0 | (ch >> 18));
379 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
380 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
381 *p++ = (char)(0x80 | (ch & 0x3f));
382 }
383#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
384#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
385 }
386
387 if (result == NULL) {
388 /* This was stack allocated. */
389 nneeded = p - stackbuf;
390 assert(nneeded <= nallocated);
391 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
392 }
393 else {
394 /* Cut back to size actually needed. */
395 nneeded = p - PyBytes_AS_STRING(result);
396 assert(nneeded <= nallocated);
397 _PyBytes_Resize(&result, nneeded);
398 }
399
400#if STRINGLIB_SIZEOF_CHAR > 1
401 Py_XDECREF(errorHandler);
402 Py_XDECREF(exc);
403#endif
404 return result;
405
406#if STRINGLIB_SIZEOF_CHAR > 1
407 error:
408 Py_XDECREF(rep);
409 Py_XDECREF(errorHandler);
410 Py_XDECREF(exc);
411 Py_XDECREF(result);
412 return NULL;
413#endif
414
415#undef MAX_SHORT_UNICHARS
416}
417
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100418#endif /* STRINGLIB_IS_UNICODE */