blob: 2a01089c0fcba89d75e61f4c8d9c2e9179011cce [file] [log] [blame]
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01001/* stringlib: codec implementations */
2
3#if STRINGLIB_IS_UNICODE
4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005/* Mask to quickly check whether a C 'long' contains a
6 non-ASCII, UTF8-encoded char. */
7#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02008# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01009#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +020010# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010011#else
12# error C 'long' size should be either 4 or 8!
13#endif
14
Mark Dickinson106c4142012-06-23 21:45:14 +010015/* 10xxxxxx */
16#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
17
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020018Py_LOCAL_INLINE(Py_UCS4)
19STRINGLIB(utf8_decode)(const char **inptr, const char *end,
20 STRINGLIB_CHAR *dest,
21 Py_ssize_t *outpos)
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010022{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020023 Py_UCS4 ch;
24 const char *s = *inptr;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020025 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020026 STRINGLIB_CHAR *p = dest + *outpos;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010027
28 while (s < end) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020029 ch = (unsigned char)*s;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010030
31 if (ch < 0x80) {
32 /* Fast path for runs of ASCII characters. Given that common UTF-8
33 input will consist of an overwhelming majority of ASCII
34 characters, we try to optimize for this case by checking
35 as many characters as a C 'long' can contain.
36 First, check if we can do an aligned read, as most CPUs have
37 a penalty for unaligned reads.
38 */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020039 if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010040 /* Help register allocation */
41 register const char *_s = s;
42 register STRINGLIB_CHAR *_p = p;
43 while (_s < aligned_end) {
44 /* Read a whole long at a time (either 4 or 8 bytes),
45 and do a fast unrolled copy if it only contains ASCII
46 characters. */
47 unsigned long value = *(unsigned long *) _s;
48 if (value & ASCII_CHAR_MASK)
49 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020050#ifdef BYTEORDER_IS_LITTLE_ENDIAN
51 _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
52 _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
53 _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
54 _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
55# if SIZEOF_LONG == 8
56 _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
57 _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
58 _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
59 _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
60# endif
61#else
62# if SIZEOF_LONG == 8
63 _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
64 _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
65 _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
66 _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
67 _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
68 _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
69 _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
70 _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
71# else
72 _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
73 _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
74 _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
75 _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
76# endif
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010077#endif
78 _s += SIZEOF_LONG;
79 _p += SIZEOF_LONG;
80 }
81 s = _s;
82 p = _p;
83 if (s == end)
84 break;
85 ch = (unsigned char)*s;
86 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020087 if (ch < 0x80) {
88 s++;
89 *p++ = ch;
90 continue;
91 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010092 }
93
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020094 if (ch < 0xC2) {
95 /* invalid sequence
96 \x80-\xBF -- continuation byte
97 \xC0-\xC1 -- fake 0000-007F */
98 goto InvalidStart;
99 }
100
101 if (ch < 0xE0) {
102 /* \xC2\x80-\xDF\xBF -- 0080-07FF */
103 Py_UCS4 ch2;
104 if (end - s < 2) {
105 /* unexpected end of data: the caller will decide whether
106 it's an error or not */
107 break;
108 }
109 ch2 = (unsigned char)s[1];
Mark Dickinson106c4142012-06-23 21:45:14 +0100110 if (!IS_CONTINUATION_BYTE(ch2))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200111 /* invalid continuation byte */
112 goto InvalidContinuation;
113 ch = (ch << 6) + ch2 -
114 ((0xC0 << 6) + 0x80);
115 assert ((ch > 0x007F) && (ch <= 0x07FF));
116 s += 2;
117 if (STRINGLIB_MAX_CHAR <= 0x007F ||
118 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
119 goto Overflow;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100120 *p++ = ch;
121 continue;
122 }
123
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200124 if (ch < 0xF0) {
125 /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
126 Py_UCS4 ch2, ch3;
127 if (end - s < 3) {
128 /* unexpected end of data: the caller will decide whether
129 it's an error or not */
130 break;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100131 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200132 ch2 = (unsigned char)s[1];
133 ch3 = (unsigned char)s[2];
Mark Dickinson106c4142012-06-23 21:45:14 +0100134 if (!IS_CONTINUATION_BYTE(ch2) ||
135 !IS_CONTINUATION_BYTE(ch3)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200136 /* invalid continuation byte */
137 goto InvalidContinuation;
138 }
139 if (ch == 0xE0) {
140 if (ch2 < 0xA0)
141 /* invalid sequence
142 \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
143 goto InvalidContinuation;
144 }
145 else if (ch == 0xED && ch2 > 0x9F) {
146 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
147 will result in surrogates in range D800-DFFF. Surrogates are
148 not valid UTF-8 so they are rejected.
149 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
150 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
151 goto InvalidContinuation;
152 }
153 ch = (ch << 12) + (ch2 << 6) + ch3 -
154 ((0xE0 << 12) + (0x80 << 6) + 0x80);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100155 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
156 s += 3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200157 if (STRINGLIB_MAX_CHAR <= 0x07FF ||
158 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
159 goto Overflow;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100160 *p++ = ch;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200161 continue;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100162 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200163
164 if (ch < 0xF5) {
165 /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
166 Py_UCS4 ch2, ch3, ch4;
167 if (end - s < 4) {
168 /* unexpected end of data: the caller will decide whether
169 it's an error or not */
170 break;
171 }
172 ch2 = (unsigned char)s[1];
173 ch3 = (unsigned char)s[2];
174 ch4 = (unsigned char)s[3];
Mark Dickinson106c4142012-06-23 21:45:14 +0100175 if (!IS_CONTINUATION_BYTE(ch2) ||
176 !IS_CONTINUATION_BYTE(ch3) ||
177 !IS_CONTINUATION_BYTE(ch4)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200178 /* invalid continuation byte */
179 goto InvalidContinuation;
180 }
181 if (ch == 0xF0) {
182 if (ch2 < 0x90)
183 /* invalid sequence
184 \xF0\x80\x80\x80-\xF0\x80\xBF\xBF -- fake 0000-FFFF */
185 goto InvalidContinuation;
186 }
187 else if (ch == 0xF4 && ch2 > 0x8F) {
188 /* invalid sequence
189 \xF4\x90\x80\80- -- 110000- overflow */
190 goto InvalidContinuation;
191 }
192 ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
193 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
194 assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
195 s += 4;
196 if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
197 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
198 goto Overflow;
199 *p++ = ch;
200 continue;
201 }
202 goto InvalidStart;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100203 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200204 ch = 0;
205Overflow:
206Return:
207 *inptr = s;
208 *outpos = p - dest;
209 return ch;
210InvalidStart:
211 ch = 1;
212 goto Return;
213InvalidContinuation:
214 ch = 2;
215 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100216}
217
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100218#undef ASCII_CHAR_MASK
Mark Dickinson106c4142012-06-23 21:45:14 +0100219#undef IS_CONTINUATION_BYTE
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100220
Victor Stinner6099a032011-12-18 14:22:26 +0100221
222/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
223 PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
224 UCS-1 strings don't need to handle surrogates for example. */
225Py_LOCAL_INLINE(PyObject *)
226STRINGLIB(utf8_encoder)(PyObject *unicode,
227 STRINGLIB_CHAR *data,
228 Py_ssize_t size,
229 const char *errors)
230{
231#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
232
233 Py_ssize_t i; /* index into s of next input byte */
234 PyObject *result; /* result string object */
235 char *p; /* next free byte in output buffer */
236 Py_ssize_t nallocated; /* number of result bytes allocated */
237 Py_ssize_t nneeded; /* number of result bytes needed */
238#if STRINGLIB_SIZEOF_CHAR > 1
239 PyObject *errorHandler = NULL;
240 PyObject *exc = NULL;
241 PyObject *rep = NULL;
242#endif
243#if STRINGLIB_SIZEOF_CHAR == 1
244 const Py_ssize_t max_char_size = 2;
245 char stackbuf[MAX_SHORT_UNICHARS * 2];
246#elif STRINGLIB_SIZEOF_CHAR == 2
247 const Py_ssize_t max_char_size = 3;
248 char stackbuf[MAX_SHORT_UNICHARS * 3];
249#else /* STRINGLIB_SIZEOF_CHAR == 4 */
250 const Py_ssize_t max_char_size = 4;
251 char stackbuf[MAX_SHORT_UNICHARS * 4];
252#endif
253
254 assert(size >= 0);
255
256 if (size <= MAX_SHORT_UNICHARS) {
257 /* Write into the stack buffer; nallocated can't overflow.
258 * At the end, we'll allocate exactly as much heap space as it
259 * turns out we need.
260 */
261 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
262 result = NULL; /* will allocate after we're done */
263 p = stackbuf;
264 }
265 else {
266 if (size > PY_SSIZE_T_MAX / max_char_size) {
267 /* integer overflow */
268 return PyErr_NoMemory();
269 }
270 /* Overallocate on the heap, and give the excess back at the end. */
271 nallocated = size * max_char_size;
272 result = PyBytes_FromStringAndSize(NULL, nallocated);
273 if (result == NULL)
274 return NULL;
275 p = PyBytes_AS_STRING(result);
276 }
277
278 for (i = 0; i < size;) {
279 Py_UCS4 ch = data[i++];
280
281 if (ch < 0x80) {
282 /* Encode ASCII */
283 *p++ = (char) ch;
284
285 }
286 else
287#if STRINGLIB_SIZEOF_CHAR > 1
288 if (ch < 0x0800)
289#endif
290 {
291 /* Encode Latin-1 */
292 *p++ = (char)(0xc0 | (ch >> 6));
293 *p++ = (char)(0x80 | (ch & 0x3f));
294 }
295#if STRINGLIB_SIZEOF_CHAR > 1
296 else if (Py_UNICODE_IS_SURROGATE(ch)) {
297 Py_ssize_t newpos;
298 Py_ssize_t repsize, k, startpos;
299 startpos = i-1;
300 rep = unicode_encode_call_errorhandler(
301 errors, &errorHandler, "utf-8", "surrogates not allowed",
302 unicode, &exc, startpos, startpos+1, &newpos);
303 if (!rep)
304 goto error;
305
306 if (PyBytes_Check(rep))
307 repsize = PyBytes_GET_SIZE(rep);
308 else
309 repsize = PyUnicode_GET_LENGTH(rep);
310
311 if (repsize > max_char_size) {
312 Py_ssize_t offset;
313
314 if (result == NULL)
315 offset = p - stackbuf;
316 else
317 offset = p - PyBytes_AS_STRING(result);
318
319 if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
320 /* integer overflow */
321 PyErr_NoMemory();
322 goto error;
323 }
324 nallocated += repsize - max_char_size;
325 if (result != NULL) {
326 if (_PyBytes_Resize(&result, nallocated) < 0)
327 goto error;
328 } else {
329 result = PyBytes_FromStringAndSize(NULL, nallocated);
330 if (result == NULL)
331 goto error;
332 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
333 }
334 p = PyBytes_AS_STRING(result) + offset;
335 }
336
337 if (PyBytes_Check(rep)) {
338 char *prep = PyBytes_AS_STRING(rep);
339 for(k = repsize; k > 0; k--)
340 *p++ = *prep++;
341 } else /* rep is unicode */ {
342 enum PyUnicode_Kind repkind;
343 void *repdata;
344
345 if (PyUnicode_READY(rep) < 0)
346 goto error;
347 repkind = PyUnicode_KIND(rep);
348 repdata = PyUnicode_DATA(rep);
349
350 for(k=0; k<repsize; k++) {
351 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
352 if (0x80 <= c) {
353 raise_encode_exception(&exc, "utf-8",
354 unicode,
355 i-1, i,
356 "surrogates not allowed");
357 goto error;
358 }
359 *p++ = (char)c;
360 }
361 }
362 Py_CLEAR(rep);
363 }
364 else
365#if STRINGLIB_SIZEOF_CHAR > 2
366 if (ch < 0x10000)
367#endif
368 {
369 *p++ = (char)(0xe0 | (ch >> 12));
370 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
371 *p++ = (char)(0x80 | (ch & 0x3f));
372 }
373#if STRINGLIB_SIZEOF_CHAR > 2
374 else /* ch >= 0x10000 */
375 {
376 assert(ch <= MAX_UNICODE);
377 /* Encode UCS4 Unicode ordinals */
378 *p++ = (char)(0xf0 | (ch >> 18));
379 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
380 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
381 *p++ = (char)(0x80 | (ch & 0x3f));
382 }
383#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
384#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
385 }
386
387 if (result == NULL) {
388 /* This was stack allocated. */
389 nneeded = p - stackbuf;
390 assert(nneeded <= nallocated);
391 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
392 }
393 else {
394 /* Cut back to size actually needed. */
395 nneeded = p - PyBytes_AS_STRING(result);
396 assert(nneeded <= nallocated);
397 _PyBytes_Resize(&result, nneeded);
398 }
399
400#if STRINGLIB_SIZEOF_CHAR > 1
401 Py_XDECREF(errorHandler);
402 Py_XDECREF(exc);
403#endif
404 return result;
405
406#if STRINGLIB_SIZEOF_CHAR > 1
407 error:
408 Py_XDECREF(rep);
409 Py_XDECREF(errorHandler);
410 Py_XDECREF(exc);
411 Py_XDECREF(result);
412 return NULL;
413#endif
414
415#undef MAX_SHORT_UNICHARS
416}
417
Antoine Pitrou63065d72012-05-15 23:48:04 +0200418/* The pattern for constructing UCS2-repeated masks. */
419#if SIZEOF_LONG == 8
420# define UCS2_REPEAT_MASK 0x0001000100010001ul
421#elif SIZEOF_LONG == 4
422# define UCS2_REPEAT_MASK 0x00010001ul
423#else
424# error C 'long' size should be either 4 or 8!
425#endif
426
427/* The mask for fast checking. */
428#if STRINGLIB_SIZEOF_CHAR == 1
429/* The mask for fast checking of whether a C 'long' contains a
430 non-ASCII or non-Latin1 UTF16-encoded characters. */
431# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
432#else
433/* The mask for fast checking of whether a C 'long' may contain
434 UTF16-encoded surrogate characters. This is an efficient heuristic,
435 assuming that non-surrogate characters with a code point >= 0x8000 are
436 rare in most input.
437*/
438# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
439#endif
440/* The mask for fast byte-swapping. */
441#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
442/* Swap bytes. */
443#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
444 (((value) & STRIPPED_MASK) << 8))
445
446Py_LOCAL_INLINE(Py_UCS4)
447STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
448 STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
449 int native_ordering)
450{
451 Py_UCS4 ch;
452 const unsigned char *aligned_end =
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200453 (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
Antoine Pitrou63065d72012-05-15 23:48:04 +0200454 const unsigned char *q = *inptr;
455 STRINGLIB_CHAR *p = dest + *outpos;
456 /* Offsets from q for retrieving byte pairs in the right order. */
457#ifdef BYTEORDER_IS_LITTLE_ENDIAN
458 int ihi = !!native_ordering, ilo = !native_ordering;
459#else
460 int ihi = !native_ordering, ilo = !!native_ordering;
461#endif
462 --e;
463
464 while (q < e) {
465 Py_UCS4 ch2;
466 /* First check for possible aligned read of a C 'long'. Unaligned
467 reads are more expensive, better to defer to another iteration. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200468 if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
Antoine Pitrou63065d72012-05-15 23:48:04 +0200469 /* Fast path for runs of in-range non-surrogate chars. */
470 register const unsigned char *_q = q;
471 while (_q < aligned_end) {
472 unsigned long block = * (unsigned long *) _q;
473 if (native_ordering) {
474 /* Can use buffer directly */
475 if (block & FAST_CHAR_MASK)
476 break;
477 }
478 else {
479 /* Need to byte-swap */
480 if (block & SWAB(FAST_CHAR_MASK))
481 break;
482#if STRINGLIB_SIZEOF_CHAR == 1
483 block >>= 8;
484#else
485 block = SWAB(block);
486#endif
487 }
488#ifdef BYTEORDER_IS_LITTLE_ENDIAN
489# if SIZEOF_LONG == 4
490 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
491 p[1] = (STRINGLIB_CHAR)(block >> 16);
492# elif SIZEOF_LONG == 8
493 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
494 p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
495 p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
496 p[3] = (STRINGLIB_CHAR)(block >> 48);
497# endif
498#else
499# if SIZEOF_LONG == 4
500 p[0] = (STRINGLIB_CHAR)(block >> 16);
501 p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
502# elif SIZEOF_LONG == 8
503 p[0] = (STRINGLIB_CHAR)(block >> 48);
504 p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
505 p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
506 p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
507# endif
508#endif
509 _q += SIZEOF_LONG;
510 p += SIZEOF_LONG / 2;
511 }
512 q = _q;
513 if (q >= e)
514 break;
515 }
516
517 ch = (q[ihi] << 8) | q[ilo];
518 q += 2;
519 if (!Py_UNICODE_IS_SURROGATE(ch)) {
520#if STRINGLIB_SIZEOF_CHAR < 2
521 if (ch > STRINGLIB_MAX_CHAR)
522 /* Out-of-range */
523 goto Return;
524#endif
525 *p++ = (STRINGLIB_CHAR)ch;
526 continue;
527 }
528
529 /* UTF-16 code pair: */
530 if (q >= e)
531 goto UnexpectedEnd;
532 if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
533 goto IllegalEncoding;
534 ch2 = (q[ihi] << 8) | q[ilo];
535 q += 2;
536 if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
537 goto IllegalSurrogate;
538 ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
539#if STRINGLIB_SIZEOF_CHAR < 4
540 /* Out-of-range */
541 goto Return;
542#else
543 *p++ = (STRINGLIB_CHAR)ch;
544#endif
545 }
546 ch = 0;
547Return:
548 *inptr = q;
549 *outpos = p - dest;
550 return ch;
551UnexpectedEnd:
552 ch = 1;
553 goto Return;
554IllegalEncoding:
555 ch = 2;
556 goto Return;
557IllegalSurrogate:
558 ch = 3;
559 goto Return;
560}
561#undef UCS2_REPEAT_MASK
562#undef FAST_CHAR_MASK
563#undef STRIPPED_MASK
564#undef SWAB
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200565
566
567Py_LOCAL_INLINE(void)
568STRINGLIB(utf16_encode)(unsigned short *out,
569 const STRINGLIB_CHAR *in,
570 Py_ssize_t len,
571 int native_ordering)
572{
573 const STRINGLIB_CHAR *end = in + len;
574#if STRINGLIB_SIZEOF_CHAR == 1
575# define SWAB2(CH) ((CH) << 8)
576#else
577# define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
578#endif
579#if STRINGLIB_MAX_CHAR < 0x10000
580 if (native_ordering) {
581# if STRINGLIB_SIZEOF_CHAR == 2
582 Py_MEMCPY(out, in, 2 * len);
583# else
584 _PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out);
585# endif
586 } else {
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200587 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200588 while (in < unrolled_end) {
589 out[0] = SWAB2(in[0]);
590 out[1] = SWAB2(in[1]);
591 out[2] = SWAB2(in[2]);
592 out[3] = SWAB2(in[3]);
593 in += 4; out += 4;
594 }
595 while (in < end) {
596 *out++ = SWAB2(*in);
597 ++in;
598 }
599 }
600#else
601 if (native_ordering) {
602 while (in < end) {
603 Py_UCS4 ch = *in++;
604 if (ch < 0x10000)
605 *out++ = ch;
606 else {
607 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
608 out[1] = Py_UNICODE_LOW_SURROGATE(ch);
609 out += 2;
610 }
611 }
612 } else {
613 while (in < end) {
614 Py_UCS4 ch = *in++;
615 if (ch < 0x10000)
616 *out++ = SWAB2((Py_UCS2)ch);
617 else {
618 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
619 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
620 out[0] = SWAB2(ch1);
621 out[1] = SWAB2(ch2);
622 out += 2;
623 }
624 }
625 }
626#endif
627#undef SWAB2
628}
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100629#endif /* STRINGLIB_IS_UNICODE */