blob: 562191c18e9ba69392eb3f6cc5365d2f4121615b [file] [log] [blame]
/* stringlib: codec implementations */
2
3#if STRINGLIB_IS_UNICODE
4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005/* Mask to quickly check whether a C 'long' contains a
6 non-ASCII, UTF8-encoded char. */
7#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02008# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01009#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +020010# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010011#else
12# error C 'long' size should be either 4 or 8!
13#endif
14
/* True when `ch` is a UTF-8 continuation byte, i.e. has the bit pattern
   10xxxxxx (0x80..0xBF).  The argument is evaluated twice, so it must not
   have side effects. */
#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
17
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020018Py_LOCAL_INLINE(Py_UCS4)
19STRINGLIB(utf8_decode)(const char **inptr, const char *end,
20 STRINGLIB_CHAR *dest,
21 Py_ssize_t *outpos)
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010022{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020023 Py_UCS4 ch;
24 const char *s = *inptr;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020025 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020026 STRINGLIB_CHAR *p = dest + *outpos;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010027
28 while (s < end) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020029 ch = (unsigned char)*s;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010030
31 if (ch < 0x80) {
32 /* Fast path for runs of ASCII characters. Given that common UTF-8
33 input will consist of an overwhelming majority of ASCII
34 characters, we try to optimize for this case by checking
35 as many characters as a C 'long' can contain.
36 First, check if we can do an aligned read, as most CPUs have
37 a penalty for unaligned reads.
38 */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020039 if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010040 /* Help register allocation */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020041 const char *_s = s;
42 STRINGLIB_CHAR *_p = p;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010043 while (_s < aligned_end) {
44 /* Read a whole long at a time (either 4 or 8 bytes),
45 and do a fast unrolled copy if it only contains ASCII
46 characters. */
47 unsigned long value = *(unsigned long *) _s;
48 if (value & ASCII_CHAR_MASK)
49 break;
Christian Heimes743e0cd2012-10-17 23:52:17 +020050#if PY_LITTLE_ENDIAN
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020051 _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
52 _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
53 _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
54 _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
55# if SIZEOF_LONG == 8
56 _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
57 _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
58 _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
59 _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
60# endif
61#else
62# if SIZEOF_LONG == 8
63 _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
64 _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
65 _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
66 _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
67 _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
68 _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
69 _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
70 _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
71# else
72 _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
73 _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
74 _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
75 _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
76# endif
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010077#endif
78 _s += SIZEOF_LONG;
79 _p += SIZEOF_LONG;
80 }
81 s = _s;
82 p = _p;
83 if (s == end)
84 break;
85 ch = (unsigned char)*s;
86 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020087 if (ch < 0x80) {
88 s++;
89 *p++ = ch;
90 continue;
91 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010092 }
93
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020094 if (ch < 0xE0) {
95 /* \xC2\x80-\xDF\xBF -- 0080-07FF */
Victor Stinnerab60de42012-11-04 23:59:15 +010096 Py_UCS4 ch2;
Ezio Melottif7ed5d12012-11-04 23:21:38 +020097 if (ch < 0xC2) {
98 /* invalid sequence
99 \x80-\xBF -- continuation byte
100 \xC0-\xC1 -- fake 0000-007F */
101 goto InvalidStart;
102 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200103 if (end - s < 2) {
104 /* unexpected end of data: the caller will decide whether
105 it's an error or not */
106 break;
107 }
108 ch2 = (unsigned char)s[1];
Mark Dickinson106c4142012-06-23 21:45:14 +0100109 if (!IS_CONTINUATION_BYTE(ch2))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200110 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200111 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200112 ch = (ch << 6) + ch2 -
113 ((0xC0 << 6) + 0x80);
114 assert ((ch > 0x007F) && (ch <= 0x07FF));
115 s += 2;
116 if (STRINGLIB_MAX_CHAR <= 0x007F ||
117 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200118 /* Out-of-range */
119 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100120 *p++ = ch;
121 continue;
122 }
123
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200124 if (ch < 0xF0) {
125 /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
126 Py_UCS4 ch2, ch3;
127 if (end - s < 3) {
128 /* unexpected end of data: the caller will decide whether
129 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200130 if (end - s < 2)
131 break;
132 ch2 = (unsigned char)s[1];
133 if (!IS_CONTINUATION_BYTE(ch2) ||
134 (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
135 /* for clarification see comments below */
136 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200137 break;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100138 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200139 ch2 = (unsigned char)s[1];
140 ch3 = (unsigned char)s[2];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200141 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200142 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200143 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200144 }
145 if (ch == 0xE0) {
146 if (ch2 < 0xA0)
147 /* invalid sequence
148 \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200149 goto InvalidContinuation1;
150 } else if (ch == 0xED && ch2 >= 0xA0) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200151 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
152 will result in surrogates in range D800-DFFF. Surrogates are
153 not valid UTF-8 so they are rejected.
154 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
155 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200156 goto InvalidContinuation1;
157 }
158 if (!IS_CONTINUATION_BYTE(ch3)) {
159 /* invalid continuation byte */
160 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200161 }
162 ch = (ch << 12) + (ch2 << 6) + ch3 -
163 ((0xE0 << 12) + (0x80 << 6) + 0x80);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100164 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
165 s += 3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200166 if (STRINGLIB_MAX_CHAR <= 0x07FF ||
167 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200168 /* Out-of-range */
169 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100170 *p++ = ch;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200171 continue;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100172 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200173
174 if (ch < 0xF5) {
175 /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
176 Py_UCS4 ch2, ch3, ch4;
177 if (end - s < 4) {
178 /* unexpected end of data: the caller will decide whether
179 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200180 if (end - s < 2)
181 break;
182 ch2 = (unsigned char)s[1];
183 if (!IS_CONTINUATION_BYTE(ch2) ||
184 (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
185 /* for clarification see comments below */
186 goto InvalidContinuation1;
187 if (end - s < 3)
188 break;
189 ch3 = (unsigned char)s[2];
190 if (!IS_CONTINUATION_BYTE(ch3))
191 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200192 break;
193 }
194 ch2 = (unsigned char)s[1];
195 ch3 = (unsigned char)s[2];
196 ch4 = (unsigned char)s[3];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200197 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200198 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200199 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200200 }
201 if (ch == 0xF0) {
202 if (ch2 < 0x90)
203 /* invalid sequence
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200204 \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
205 goto InvalidContinuation1;
206 } else if (ch == 0xF4 && ch2 >= 0x90) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200207 /* invalid sequence
208 \xF4\x90\x80\80- -- 110000- overflow */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200209 goto InvalidContinuation1;
210 }
211 if (!IS_CONTINUATION_BYTE(ch3)) {
212 /* invalid continuation byte */
213 goto InvalidContinuation2;
214 }
215 if (!IS_CONTINUATION_BYTE(ch4)) {
216 /* invalid continuation byte */
217 goto InvalidContinuation3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200218 }
219 ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
220 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
221 assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
222 s += 4;
223 if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
224 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200225 /* Out-of-range */
226 goto Return;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200227 *p++ = ch;
228 continue;
229 }
230 goto InvalidStart;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100231 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200232 ch = 0;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200233Return:
234 *inptr = s;
235 *outpos = p - dest;
236 return ch;
237InvalidStart:
238 ch = 1;
239 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200240InvalidContinuation1:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200241 ch = 2;
242 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200243InvalidContinuation2:
244 ch = 3;
245 goto Return;
246InvalidContinuation3:
247 ch = 4;
248 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100249}
250
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100251#undef ASCII_CHAR_MASK
252
Victor Stinner6099a032011-12-18 14:22:26 +0100253
254/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
255 PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
256 UCS-1 strings don't need to handle surrogates for example. */
257Py_LOCAL_INLINE(PyObject *)
258STRINGLIB(utf8_encoder)(PyObject *unicode,
259 STRINGLIB_CHAR *data,
260 Py_ssize_t size,
261 const char *errors)
262{
263#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
264
265 Py_ssize_t i; /* index into s of next input byte */
266 PyObject *result; /* result string object */
267 char *p; /* next free byte in output buffer */
268 Py_ssize_t nallocated; /* number of result bytes allocated */
269 Py_ssize_t nneeded; /* number of result bytes needed */
270#if STRINGLIB_SIZEOF_CHAR > 1
Victor Stinner01ada392015-10-01 21:54:51 +0200271 PyObject *error_handler_obj = NULL;
Victor Stinner6099a032011-12-18 14:22:26 +0100272 PyObject *exc = NULL;
273 PyObject *rep = NULL;
Victor Stinner01ada392015-10-01 21:54:51 +0200274 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6099a032011-12-18 14:22:26 +0100275#endif
276#if STRINGLIB_SIZEOF_CHAR == 1
277 const Py_ssize_t max_char_size = 2;
278 char stackbuf[MAX_SHORT_UNICHARS * 2];
279#elif STRINGLIB_SIZEOF_CHAR == 2
280 const Py_ssize_t max_char_size = 3;
281 char stackbuf[MAX_SHORT_UNICHARS * 3];
282#else /* STRINGLIB_SIZEOF_CHAR == 4 */
283 const Py_ssize_t max_char_size = 4;
284 char stackbuf[MAX_SHORT_UNICHARS * 4];
285#endif
286
287 assert(size >= 0);
288
289 if (size <= MAX_SHORT_UNICHARS) {
290 /* Write into the stack buffer; nallocated can't overflow.
291 * At the end, we'll allocate exactly as much heap space as it
292 * turns out we need.
293 */
294 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
295 result = NULL; /* will allocate after we're done */
296 p = stackbuf;
297 }
298 else {
299 if (size > PY_SSIZE_T_MAX / max_char_size) {
300 /* integer overflow */
301 return PyErr_NoMemory();
302 }
303 /* Overallocate on the heap, and give the excess back at the end. */
304 nallocated = size * max_char_size;
305 result = PyBytes_FromStringAndSize(NULL, nallocated);
306 if (result == NULL)
307 return NULL;
308 p = PyBytes_AS_STRING(result);
309 }
310
311 for (i = 0; i < size;) {
312 Py_UCS4 ch = data[i++];
313
314 if (ch < 0x80) {
315 /* Encode ASCII */
316 *p++ = (char) ch;
317
318 }
319 else
320#if STRINGLIB_SIZEOF_CHAR > 1
321 if (ch < 0x0800)
322#endif
323 {
324 /* Encode Latin-1 */
325 *p++ = (char)(0xc0 | (ch >> 6));
326 *p++ = (char)(0x80 | (ch & 0x3f));
327 }
328#if STRINGLIB_SIZEOF_CHAR > 1
329 else if (Py_UNICODE_IS_SURROGATE(ch)) {
Victor Stinner01ada392015-10-01 21:54:51 +0200330 Py_ssize_t startpos, endpos, newpos;
331 Py_ssize_t repsize, k;
332 if (error_handler == _Py_ERROR_UNKNOWN)
333 error_handler = get_error_handler(errors);
334
Victor Stinner6099a032011-12-18 14:22:26 +0100335 startpos = i-1;
Victor Stinner01ada392015-10-01 21:54:51 +0200336 endpos = startpos+1;
Victor Stinner6099a032011-12-18 14:22:26 +0100337
Victor Stinner01ada392015-10-01 21:54:51 +0200338 while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
339 endpos++;
Victor Stinner6099a032011-12-18 14:22:26 +0100340
Victor Stinner01ada392015-10-01 21:54:51 +0200341 switch (error_handler)
342 {
343 case _Py_ERROR_REPLACE:
344 memset(p, '?', endpos - startpos);
345 p += (endpos - startpos);
346 /* fall through the ignore handler */
347 case _Py_ERROR_IGNORE:
348 i += (endpos - startpos - 1);
349 break;
Victor Stinner6099a032011-12-18 14:22:26 +0100350
Victor Stinner01ada392015-10-01 21:54:51 +0200351
352 case _Py_ERROR_SURROGATEPASS:
353 for (k=startpos; k<endpos; k++) {
354 ch = data[k];
355 *p++ = (char)(0xe0 | (ch >> 12));
356 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
357 *p++ = (char)(0x80 | (ch & 0x3f));
358 }
359 i += (endpos - startpos - 1);
360 break;
361
362 case _Py_ERROR_SURROGATEESCAPE:
363 for (k=startpos; k<endpos; k++) {
364 ch = data[k];
365 if (!(0xDC80 <= ch && ch <= 0xDCFF))
366 break;
367 *p++ = (char)(ch & 0xff);
368 }
369 if (k >= endpos) {
370 i += (endpos - startpos - 1);
371 break;
372 }
373 startpos = k;
374 assert(startpos < endpos);
375 /* fall through the default handler */
376
377 default:
378 rep = unicode_encode_call_errorhandler(
379 errors, &error_handler_obj, "utf-8", "surrogates not allowed",
380 unicode, &exc, startpos, endpos, &newpos);
381 if (!rep)
382 goto error;
383
384 if (PyBytes_Check(rep))
385 repsize = PyBytes_GET_SIZE(rep);
Victor Stinner6099a032011-12-18 14:22:26 +0100386 else
Victor Stinner01ada392015-10-01 21:54:51 +0200387 repsize = PyUnicode_GET_LENGTH(rep);
Victor Stinner6099a032011-12-18 14:22:26 +0100388
Victor Stinner01ada392015-10-01 21:54:51 +0200389 if (repsize > max_char_size) {
390 Py_ssize_t offset;
391
Victor Stinner6099a032011-12-18 14:22:26 +0100392 if (result == NULL)
Victor Stinner01ada392015-10-01 21:54:51 +0200393 offset = p - stackbuf;
394 else
395 offset = p - PyBytes_AS_STRING(result);
396
397 if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
398 /* integer overflow */
399 PyErr_NoMemory();
Victor Stinner6099a032011-12-18 14:22:26 +0100400 goto error;
Victor Stinner01ada392015-10-01 21:54:51 +0200401 }
402 nallocated += repsize - max_char_size;
403 if (result != NULL) {
404 if (_PyBytes_Resize(&result, nallocated) < 0)
405 goto error;
406 } else {
407 result = PyBytes_FromStringAndSize(NULL, nallocated);
408 if (result == NULL)
409 goto error;
410 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
411 }
412 p = PyBytes_AS_STRING(result) + offset;
Victor Stinner6099a032011-12-18 14:22:26 +0100413 }
Victor Stinner6099a032011-12-18 14:22:26 +0100414
Victor Stinner01ada392015-10-01 21:54:51 +0200415 if (PyBytes_Check(rep)) {
416 memcpy(p, PyBytes_AS_STRING(rep), repsize);
417 p += repsize;
418 }
419 else {
420 /* rep is unicode */
421 if (PyUnicode_READY(rep) < 0)
422 goto error;
Victor Stinner6099a032011-12-18 14:22:26 +0100423
Victor Stinner01ada392015-10-01 21:54:51 +0200424 if (!PyUnicode_IS_ASCII(rep)) {
Victor Stinner6099a032011-12-18 14:22:26 +0100425 raise_encode_exception(&exc, "utf-8",
426 unicode,
427 i-1, i,
428 "surrogates not allowed");
429 goto error;
430 }
Victor Stinner01ada392015-10-01 21:54:51 +0200431
432 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
433 memcpy(p, PyUnicode_DATA(rep), repsize);
434 p += repsize;
Victor Stinner6099a032011-12-18 14:22:26 +0100435 }
Victor Stinner01ada392015-10-01 21:54:51 +0200436 Py_CLEAR(rep);
437
438 i = newpos;
Victor Stinner6099a032011-12-18 14:22:26 +0100439 }
Victor Stinner6099a032011-12-18 14:22:26 +0100440 }
441 else
442#if STRINGLIB_SIZEOF_CHAR > 2
443 if (ch < 0x10000)
444#endif
445 {
446 *p++ = (char)(0xe0 | (ch >> 12));
447 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
448 *p++ = (char)(0x80 | (ch & 0x3f));
449 }
450#if STRINGLIB_SIZEOF_CHAR > 2
451 else /* ch >= 0x10000 */
452 {
453 assert(ch <= MAX_UNICODE);
454 /* Encode UCS4 Unicode ordinals */
455 *p++ = (char)(0xf0 | (ch >> 18));
456 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
457 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
458 *p++ = (char)(0x80 | (ch & 0x3f));
459 }
460#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
461#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
462 }
463
464 if (result == NULL) {
465 /* This was stack allocated. */
466 nneeded = p - stackbuf;
467 assert(nneeded <= nallocated);
468 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
469 }
470 else {
471 /* Cut back to size actually needed. */
472 nneeded = p - PyBytes_AS_STRING(result);
473 assert(nneeded <= nallocated);
474 _PyBytes_Resize(&result, nneeded);
475 }
476
477#if STRINGLIB_SIZEOF_CHAR > 1
Victor Stinner01ada392015-10-01 21:54:51 +0200478 Py_XDECREF(error_handler_obj);
Victor Stinner6099a032011-12-18 14:22:26 +0100479 Py_XDECREF(exc);
480#endif
481 return result;
482
483#if STRINGLIB_SIZEOF_CHAR > 1
484 error:
485 Py_XDECREF(rep);
Victor Stinner01ada392015-10-01 21:54:51 +0200486 Py_XDECREF(error_handler_obj);
Victor Stinner6099a032011-12-18 14:22:26 +0100487 Py_XDECREF(exc);
488 Py_XDECREF(result);
489 return NULL;
490#endif
491
492#undef MAX_SHORT_UNICHARS
493}
494
Antoine Pitrou63065d72012-05-15 23:48:04 +0200495/* The pattern for constructing UCS2-repeated masks. */
496#if SIZEOF_LONG == 8
497# define UCS2_REPEAT_MASK 0x0001000100010001ul
498#elif SIZEOF_LONG == 4
499# define UCS2_REPEAT_MASK 0x00010001ul
500#else
501# error C 'long' size should be either 4 or 8!
502#endif
503
504/* The mask for fast checking. */
505#if STRINGLIB_SIZEOF_CHAR == 1
506/* The mask for fast checking of whether a C 'long' contains a
507 non-ASCII or non-Latin1 UTF16-encoded characters. */
508# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
509#else
510/* The mask for fast checking of whether a C 'long' may contain
511 UTF16-encoded surrogate characters. This is an efficient heuristic,
512 assuming that non-surrogate characters with a code point >= 0x8000 are
513 rare in most input.
514*/
515# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
516#endif
517/* The mask for fast byte-swapping. */
518#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
519/* Swap bytes. */
520#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
521 (((value) & STRIPPED_MASK) << 8))
522
523Py_LOCAL_INLINE(Py_UCS4)
524STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
525 STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
526 int native_ordering)
527{
528 Py_UCS4 ch;
529 const unsigned char *aligned_end =
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200530 (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
Antoine Pitrou63065d72012-05-15 23:48:04 +0200531 const unsigned char *q = *inptr;
532 STRINGLIB_CHAR *p = dest + *outpos;
533 /* Offsets from q for retrieving byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +0200534#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +0200535 int ihi = !!native_ordering, ilo = !native_ordering;
536#else
537 int ihi = !native_ordering, ilo = !!native_ordering;
538#endif
539 --e;
540
541 while (q < e) {
542 Py_UCS4 ch2;
543 /* First check for possible aligned read of a C 'long'. Unaligned
544 reads are more expensive, better to defer to another iteration. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200545 if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
Antoine Pitrou63065d72012-05-15 23:48:04 +0200546 /* Fast path for runs of in-range non-surrogate chars. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200547 const unsigned char *_q = q;
Antoine Pitrou63065d72012-05-15 23:48:04 +0200548 while (_q < aligned_end) {
549 unsigned long block = * (unsigned long *) _q;
550 if (native_ordering) {
551 /* Can use buffer directly */
552 if (block & FAST_CHAR_MASK)
553 break;
554 }
555 else {
556 /* Need to byte-swap */
557 if (block & SWAB(FAST_CHAR_MASK))
558 break;
559#if STRINGLIB_SIZEOF_CHAR == 1
560 block >>= 8;
561#else
562 block = SWAB(block);
563#endif
564 }
Christian Heimes743e0cd2012-10-17 23:52:17 +0200565#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +0200566# if SIZEOF_LONG == 4
567 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
568 p[1] = (STRINGLIB_CHAR)(block >> 16);
569# elif SIZEOF_LONG == 8
570 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
571 p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
572 p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
573 p[3] = (STRINGLIB_CHAR)(block >> 48);
574# endif
575#else
576# if SIZEOF_LONG == 4
577 p[0] = (STRINGLIB_CHAR)(block >> 16);
578 p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
579# elif SIZEOF_LONG == 8
580 p[0] = (STRINGLIB_CHAR)(block >> 48);
581 p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
582 p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
583 p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
584# endif
585#endif
586 _q += SIZEOF_LONG;
587 p += SIZEOF_LONG / 2;
588 }
589 q = _q;
590 if (q >= e)
591 break;
592 }
593
594 ch = (q[ihi] << 8) | q[ilo];
595 q += 2;
596 if (!Py_UNICODE_IS_SURROGATE(ch)) {
597#if STRINGLIB_SIZEOF_CHAR < 2
598 if (ch > STRINGLIB_MAX_CHAR)
599 /* Out-of-range */
600 goto Return;
601#endif
602 *p++ = (STRINGLIB_CHAR)ch;
603 continue;
604 }
605
606 /* UTF-16 code pair: */
607 if (q >= e)
608 goto UnexpectedEnd;
609 if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
610 goto IllegalEncoding;
611 ch2 = (q[ihi] << 8) | q[ilo];
612 q += 2;
613 if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
614 goto IllegalSurrogate;
615 ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
616#if STRINGLIB_SIZEOF_CHAR < 4
617 /* Out-of-range */
618 goto Return;
619#else
620 *p++ = (STRINGLIB_CHAR)ch;
621#endif
622 }
623 ch = 0;
624Return:
625 *inptr = q;
626 *outpos = p - dest;
627 return ch;
628UnexpectedEnd:
629 ch = 1;
630 goto Return;
631IllegalEncoding:
632 ch = 2;
633 goto Return;
634IllegalSurrogate:
635 ch = 3;
636 goto Return;
637}
638#undef UCS2_REPEAT_MASK
639#undef FAST_CHAR_MASK
640#undef STRIPPED_MASK
641#undef SWAB
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200642
643
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200644#if STRINGLIB_MAX_CHAR >= 0x80
645Py_LOCAL_INLINE(Py_ssize_t)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200646STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
647 Py_ssize_t len,
648 unsigned short **outptr,
649 int native_ordering)
650{
651 unsigned short *out = *outptr;
652 const STRINGLIB_CHAR *end = in + len;
653#if STRINGLIB_SIZEOF_CHAR == 1
654 if (native_ordering) {
655 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
656 while (in < unrolled_end) {
657 out[0] = in[0];
658 out[1] = in[1];
659 out[2] = in[2];
660 out[3] = in[3];
661 in += 4; out += 4;
662 }
663 while (in < end) {
664 *out++ = *in++;
665 }
666 } else {
667# define SWAB2(CH) ((CH) << 8) /* high byte is zero */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200668 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200669 while (in < unrolled_end) {
670 out[0] = SWAB2(in[0]);
671 out[1] = SWAB2(in[1]);
672 out[2] = SWAB2(in[2]);
673 out[3] = SWAB2(in[3]);
674 in += 4; out += 4;
675 }
676 while (in < end) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200677 Py_UCS4 ch = *in++;
678 *out++ = SWAB2((Py_UCS2)ch);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200679 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200680#undef SWAB2
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200681 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200682 *outptr = out;
683 return len;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200684#else
685 if (native_ordering) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200686#if STRINGLIB_MAX_CHAR < 0x10000
687 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
688 while (in < unrolled_end) {
689 /* check if any character is a surrogate character */
690 if (((in[0] ^ 0xd800) &
691 (in[1] ^ 0xd800) &
692 (in[2] ^ 0xd800) &
693 (in[3] ^ 0xd800) & 0xf800) == 0)
694 break;
695 out[0] = in[0];
696 out[1] = in[1];
697 out[2] = in[2];
698 out[3] = in[3];
699 in += 4; out += 4;
700 }
701#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200702 while (in < end) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200703 Py_UCS4 ch;
704 ch = *in++;
705 if (ch < 0xd800)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200706 *out++ = ch;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200707 else if (ch < 0xe000)
Serhiy Storchaka7e29eea2015-05-18 22:19:42 +0300708 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200709 goto fail;
710#if STRINGLIB_MAX_CHAR >= 0x10000
711 else if (ch >= 0x10000) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200712 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
713 out[1] = Py_UNICODE_LOW_SURROGATE(ch);
714 out += 2;
715 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200716#endif
717 else
718 *out++ = ch;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200719 }
720 } else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200721#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
722#if STRINGLIB_MAX_CHAR < 0x10000
723 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
724 while (in < unrolled_end) {
725 /* check if any character is a surrogate character */
726 if (((in[0] ^ 0xd800) &
727 (in[1] ^ 0xd800) &
728 (in[2] ^ 0xd800) &
729 (in[3] ^ 0xd800) & 0xf800) == 0)
730 break;
731 out[0] = SWAB2(in[0]);
732 out[1] = SWAB2(in[1]);
733 out[2] = SWAB2(in[2]);
734 out[3] = SWAB2(in[3]);
735 in += 4; out += 4;
736 }
737#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200738 while (in < end) {
739 Py_UCS4 ch = *in++;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200740 if (ch < 0xd800)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200741 *out++ = SWAB2((Py_UCS2)ch);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200742 else if (ch < 0xe000)
Serhiy Storchaka7e29eea2015-05-18 22:19:42 +0300743 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200744 goto fail;
745#if STRINGLIB_MAX_CHAR >= 0x10000
746 else if (ch >= 0x10000) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200747 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
748 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
749 out[0] = SWAB2(ch1);
750 out[1] = SWAB2(ch2);
751 out += 2;
752 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200753#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200754 else
755 *out++ = SWAB2((Py_UCS2)ch);
756 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200757#undef SWAB2
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200758 }
759 *outptr = out;
760 return len;
761 fail:
762 *outptr = out;
763 return len - (end - in + 1);
764#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200765}
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300766
/* SWAB4(CH, tmp): the character CH as a byte-swapped 32-bit value.  `tmp`
   is a scratch lvalue (unused by the 1-byte variant) that the 2- and
   4-byte variants assign to, so that CH is evaluated only once. */
#if STRINGLIB_SIZEOF_CHAR == 1
/* Convert to an unsigned 32-bit type before shifting: a 1-byte character
   is promoted to (signed) int, and shifting a value >= 0x80 left by 24
   bits would overflow it, which is undefined behavior. */
# define SWAB4(CH, tmp)  ((Py_UCS4)(CH) << 24) /* high bytes are zero */
#elif STRINGLIB_SIZEOF_CHAR == 2
# define SWAB4(CH, tmp)  (tmp = (CH), \
            ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
            /* high bytes are zero */
#else
# define SWAB4(CH, tmp)  (tmp = (CH), \
            tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
            ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
#endif
778Py_LOCAL_INLINE(Py_ssize_t)
779STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
780 Py_ssize_t len,
781 PY_UINT32_T **outptr,
782 int native_ordering)
783{
784 PY_UINT32_T *out = *outptr;
785 const STRINGLIB_CHAR *end = in + len;
786 if (native_ordering) {
787 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
788 while (in < unrolled_end) {
789#if STRINGLIB_SIZEOF_CHAR > 1
790 /* check if any character is a surrogate character */
791 if (((in[0] ^ 0xd800) &
792 (in[1] ^ 0xd800) &
793 (in[2] ^ 0xd800) &
794 (in[3] ^ 0xd800) & 0xf800) == 0)
795 break;
796#endif
797 out[0] = in[0];
798 out[1] = in[1];
799 out[2] = in[2];
800 out[3] = in[3];
801 in += 4; out += 4;
802 }
803 while (in < end) {
804 Py_UCS4 ch;
805 ch = *in++;
806#if STRINGLIB_SIZEOF_CHAR > 1
807 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchaka9ce71a62015-05-18 22:20:18 +0300808 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300809 goto fail;
810 }
811#endif
812 *out++ = ch;
813 }
814 } else {
815 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
816 while (in < unrolled_end) {
817#if STRINGLIB_SIZEOF_CHAR > 1
818 Py_UCS4 ch1, ch2, ch3, ch4;
819 /* check if any character is a surrogate character */
820 if (((in[0] ^ 0xd800) &
821 (in[1] ^ 0xd800) &
822 (in[2] ^ 0xd800) &
823 (in[3] ^ 0xd800) & 0xf800) == 0)
824 break;
825#endif
826 out[0] = SWAB4(in[0], ch1);
827 out[1] = SWAB4(in[1], ch2);
828 out[2] = SWAB4(in[2], ch3);
829 out[3] = SWAB4(in[3], ch4);
830 in += 4; out += 4;
831 }
832 while (in < end) {
833 Py_UCS4 ch = *in++;
834#if STRINGLIB_SIZEOF_CHAR > 1
835 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchaka9ce71a62015-05-18 22:20:18 +0300836 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300837 goto fail;
838 }
839#endif
840 *out++ = SWAB4(ch, ch);
841 }
842 }
843 *outptr = out;
844 return len;
845#if STRINGLIB_SIZEOF_CHAR > 1
846 fail:
847 *outptr = out;
848 return len - (end - in + 1);
849#endif
850}
851#undef SWAB4
852
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200853#endif
854
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100855#endif /* STRINGLIB_IS_UNICODE */