/* stringlib: codec implementations */

3#if STRINGLIB_IS_UNICODE
4
5/* Mask to check or force alignment of a pointer to C 'long' boundaries */
6#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
7
8/* Mask to quickly check whether a C 'long' contains a
9 non-ASCII, UTF8-encoded char. */
10#if (SIZEOF_LONG == 8)
11# define ASCII_CHAR_MASK 0x8080808080808080L
12#elif (SIZEOF_LONG == 4)
13# define ASCII_CHAR_MASK 0x80808080L
14#else
15# error C 'long' size should be either 4 or 8!
16#endif
17
Mark Dickinson106c4142012-06-23 21:45:14 +010018/* 10xxxxxx */
19#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
20
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020021Py_LOCAL_INLINE(Py_UCS4)
22STRINGLIB(utf8_decode)(const char **inptr, const char *end,
23 STRINGLIB_CHAR *dest,
24 Py_ssize_t *outpos)
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010025{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020026 Py_UCS4 ch;
27 const char *s = *inptr;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010028 const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020029 STRINGLIB_CHAR *p = dest + *outpos;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010030
31 while (s < end) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020032 ch = (unsigned char)*s;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010033
34 if (ch < 0x80) {
35 /* Fast path for runs of ASCII characters. Given that common UTF-8
36 input will consist of an overwhelming majority of ASCII
37 characters, we try to optimize for this case by checking
38 as many characters as a C 'long' can contain.
39 First, check if we can do an aligned read, as most CPUs have
40 a penalty for unaligned reads.
41 */
42 if (!((size_t) s & LONG_PTR_MASK)) {
43 /* Help register allocation */
44 register const char *_s = s;
45 register STRINGLIB_CHAR *_p = p;
46 while (_s < aligned_end) {
47 /* Read a whole long at a time (either 4 or 8 bytes),
48 and do a fast unrolled copy if it only contains ASCII
49 characters. */
50 unsigned long value = *(unsigned long *) _s;
51 if (value & ASCII_CHAR_MASK)
52 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020053#ifdef BYTEORDER_IS_LITTLE_ENDIAN
54 _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
55 _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
56 _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
57 _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
58# if SIZEOF_LONG == 8
59 _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
60 _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
61 _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
62 _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
63# endif
64#else
65# if SIZEOF_LONG == 8
66 _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
67 _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
68 _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
69 _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
70 _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
71 _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
72 _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
73 _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
74# else
75 _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
76 _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
77 _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
78 _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
79# endif
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010080#endif
81 _s += SIZEOF_LONG;
82 _p += SIZEOF_LONG;
83 }
84 s = _s;
85 p = _p;
86 if (s == end)
87 break;
88 ch = (unsigned char)*s;
89 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020090 if (ch < 0x80) {
91 s++;
92 *p++ = ch;
93 continue;
94 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010095 }
96
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020097 if (ch < 0xC2) {
98 /* invalid sequence
99 \x80-\xBF -- continuation byte
100 \xC0-\xC1 -- fake 0000-007F */
101 goto InvalidStart;
102 }
103
104 if (ch < 0xE0) {
105 /* \xC2\x80-\xDF\xBF -- 0080-07FF */
106 Py_UCS4 ch2;
107 if (end - s < 2) {
108 /* unexpected end of data: the caller will decide whether
109 it's an error or not */
110 break;
111 }
112 ch2 = (unsigned char)s[1];
Mark Dickinson106c4142012-06-23 21:45:14 +0100113 if (!IS_CONTINUATION_BYTE(ch2))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200114 /* invalid continuation byte */
115 goto InvalidContinuation;
116 ch = (ch << 6) + ch2 -
117 ((0xC0 << 6) + 0x80);
118 assert ((ch > 0x007F) && (ch <= 0x07FF));
119 s += 2;
120 if (STRINGLIB_MAX_CHAR <= 0x007F ||
121 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
122 goto Overflow;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100123 *p++ = ch;
124 continue;
125 }
126
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200127 if (ch < 0xF0) {
128 /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
129 Py_UCS4 ch2, ch3;
130 if (end - s < 3) {
131 /* unexpected end of data: the caller will decide whether
132 it's an error or not */
133 break;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100134 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200135 ch2 = (unsigned char)s[1];
136 ch3 = (unsigned char)s[2];
Mark Dickinson106c4142012-06-23 21:45:14 +0100137 if (!IS_CONTINUATION_BYTE(ch2) ||
138 !IS_CONTINUATION_BYTE(ch3)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200139 /* invalid continuation byte */
140 goto InvalidContinuation;
141 }
142 if (ch == 0xE0) {
143 if (ch2 < 0xA0)
144 /* invalid sequence
145 \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
146 goto InvalidContinuation;
147 }
148 else if (ch == 0xED && ch2 > 0x9F) {
149 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
150 will result in surrogates in range D800-DFFF. Surrogates are
151 not valid UTF-8 so they are rejected.
152 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
153 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
154 goto InvalidContinuation;
155 }
156 ch = (ch << 12) + (ch2 << 6) + ch3 -
157 ((0xE0 << 12) + (0x80 << 6) + 0x80);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100158 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
159 s += 3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200160 if (STRINGLIB_MAX_CHAR <= 0x07FF ||
161 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
162 goto Overflow;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100163 *p++ = ch;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200164 continue;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100165 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200166
167 if (ch < 0xF5) {
168 /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
169 Py_UCS4 ch2, ch3, ch4;
170 if (end - s < 4) {
171 /* unexpected end of data: the caller will decide whether
172 it's an error or not */
173 break;
174 }
175 ch2 = (unsigned char)s[1];
176 ch3 = (unsigned char)s[2];
177 ch4 = (unsigned char)s[3];
Mark Dickinson106c4142012-06-23 21:45:14 +0100178 if (!IS_CONTINUATION_BYTE(ch2) ||
179 !IS_CONTINUATION_BYTE(ch3) ||
180 !IS_CONTINUATION_BYTE(ch4)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200181 /* invalid continuation byte */
182 goto InvalidContinuation;
183 }
184 if (ch == 0xF0) {
185 if (ch2 < 0x90)
186 /* invalid sequence
187 \xF0\x80\x80\x80-\xF0\x80\xBF\xBF -- fake 0000-FFFF */
188 goto InvalidContinuation;
189 }
190 else if (ch == 0xF4 && ch2 > 0x8F) {
191 /* invalid sequence
192 \xF4\x90\x80\80- -- 110000- overflow */
193 goto InvalidContinuation;
194 }
195 ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
196 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
197 assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
198 s += 4;
199 if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
200 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
201 goto Overflow;
202 *p++ = ch;
203 continue;
204 }
205 goto InvalidStart;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100206 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200207 ch = 0;
208Overflow:
209Return:
210 *inptr = s;
211 *outpos = p - dest;
212 return ch;
213InvalidStart:
214 ch = 1;
215 goto Return;
216InvalidContinuation:
217 ch = 2;
218 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100219}
220
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100221#undef ASCII_CHAR_MASK
Mark Dickinson106c4142012-06-23 21:45:14 +0100222#undef IS_CONTINUATION_BYTE
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100223
Victor Stinner6099a032011-12-18 14:22:26 +0100224
225/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
226 PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
227 UCS-1 strings don't need to handle surrogates for example. */
228Py_LOCAL_INLINE(PyObject *)
229STRINGLIB(utf8_encoder)(PyObject *unicode,
230 STRINGLIB_CHAR *data,
231 Py_ssize_t size,
232 const char *errors)
233{
234#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
235
236 Py_ssize_t i; /* index into s of next input byte */
237 PyObject *result; /* result string object */
238 char *p; /* next free byte in output buffer */
239 Py_ssize_t nallocated; /* number of result bytes allocated */
240 Py_ssize_t nneeded; /* number of result bytes needed */
241#if STRINGLIB_SIZEOF_CHAR > 1
242 PyObject *errorHandler = NULL;
243 PyObject *exc = NULL;
244 PyObject *rep = NULL;
245#endif
246#if STRINGLIB_SIZEOF_CHAR == 1
247 const Py_ssize_t max_char_size = 2;
248 char stackbuf[MAX_SHORT_UNICHARS * 2];
249#elif STRINGLIB_SIZEOF_CHAR == 2
250 const Py_ssize_t max_char_size = 3;
251 char stackbuf[MAX_SHORT_UNICHARS * 3];
252#else /* STRINGLIB_SIZEOF_CHAR == 4 */
253 const Py_ssize_t max_char_size = 4;
254 char stackbuf[MAX_SHORT_UNICHARS * 4];
255#endif
256
257 assert(size >= 0);
258
259 if (size <= MAX_SHORT_UNICHARS) {
260 /* Write into the stack buffer; nallocated can't overflow.
261 * At the end, we'll allocate exactly as much heap space as it
262 * turns out we need.
263 */
264 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
265 result = NULL; /* will allocate after we're done */
266 p = stackbuf;
267 }
268 else {
269 if (size > PY_SSIZE_T_MAX / max_char_size) {
270 /* integer overflow */
271 return PyErr_NoMemory();
272 }
273 /* Overallocate on the heap, and give the excess back at the end. */
274 nallocated = size * max_char_size;
275 result = PyBytes_FromStringAndSize(NULL, nallocated);
276 if (result == NULL)
277 return NULL;
278 p = PyBytes_AS_STRING(result);
279 }
280
281 for (i = 0; i < size;) {
282 Py_UCS4 ch = data[i++];
283
284 if (ch < 0x80) {
285 /* Encode ASCII */
286 *p++ = (char) ch;
287
288 }
289 else
290#if STRINGLIB_SIZEOF_CHAR > 1
291 if (ch < 0x0800)
292#endif
293 {
294 /* Encode Latin-1 */
295 *p++ = (char)(0xc0 | (ch >> 6));
296 *p++ = (char)(0x80 | (ch & 0x3f));
297 }
298#if STRINGLIB_SIZEOF_CHAR > 1
299 else if (Py_UNICODE_IS_SURROGATE(ch)) {
300 Py_ssize_t newpos;
301 Py_ssize_t repsize, k, startpos;
302 startpos = i-1;
303 rep = unicode_encode_call_errorhandler(
304 errors, &errorHandler, "utf-8", "surrogates not allowed",
305 unicode, &exc, startpos, startpos+1, &newpos);
306 if (!rep)
307 goto error;
308
309 if (PyBytes_Check(rep))
310 repsize = PyBytes_GET_SIZE(rep);
311 else
312 repsize = PyUnicode_GET_LENGTH(rep);
313
314 if (repsize > max_char_size) {
315 Py_ssize_t offset;
316
317 if (result == NULL)
318 offset = p - stackbuf;
319 else
320 offset = p - PyBytes_AS_STRING(result);
321
322 if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
323 /* integer overflow */
324 PyErr_NoMemory();
325 goto error;
326 }
327 nallocated += repsize - max_char_size;
328 if (result != NULL) {
329 if (_PyBytes_Resize(&result, nallocated) < 0)
330 goto error;
331 } else {
332 result = PyBytes_FromStringAndSize(NULL, nallocated);
333 if (result == NULL)
334 goto error;
335 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
336 }
337 p = PyBytes_AS_STRING(result) + offset;
338 }
339
340 if (PyBytes_Check(rep)) {
341 char *prep = PyBytes_AS_STRING(rep);
342 for(k = repsize; k > 0; k--)
343 *p++ = *prep++;
344 } else /* rep is unicode */ {
345 enum PyUnicode_Kind repkind;
346 void *repdata;
347
348 if (PyUnicode_READY(rep) < 0)
349 goto error;
350 repkind = PyUnicode_KIND(rep);
351 repdata = PyUnicode_DATA(rep);
352
353 for(k=0; k<repsize; k++) {
354 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
355 if (0x80 <= c) {
356 raise_encode_exception(&exc, "utf-8",
357 unicode,
358 i-1, i,
359 "surrogates not allowed");
360 goto error;
361 }
362 *p++ = (char)c;
363 }
364 }
365 Py_CLEAR(rep);
366 }
367 else
368#if STRINGLIB_SIZEOF_CHAR > 2
369 if (ch < 0x10000)
370#endif
371 {
372 *p++ = (char)(0xe0 | (ch >> 12));
373 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
374 *p++ = (char)(0x80 | (ch & 0x3f));
375 }
376#if STRINGLIB_SIZEOF_CHAR > 2
377 else /* ch >= 0x10000 */
378 {
379 assert(ch <= MAX_UNICODE);
380 /* Encode UCS4 Unicode ordinals */
381 *p++ = (char)(0xf0 | (ch >> 18));
382 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
383 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
384 *p++ = (char)(0x80 | (ch & 0x3f));
385 }
386#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
387#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
388 }
389
390 if (result == NULL) {
391 /* This was stack allocated. */
392 nneeded = p - stackbuf;
393 assert(nneeded <= nallocated);
394 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
395 }
396 else {
397 /* Cut back to size actually needed. */
398 nneeded = p - PyBytes_AS_STRING(result);
399 assert(nneeded <= nallocated);
400 _PyBytes_Resize(&result, nneeded);
401 }
402
403#if STRINGLIB_SIZEOF_CHAR > 1
404 Py_XDECREF(errorHandler);
405 Py_XDECREF(exc);
406#endif
407 return result;
408
409#if STRINGLIB_SIZEOF_CHAR > 1
410 error:
411 Py_XDECREF(rep);
412 Py_XDECREF(errorHandler);
413 Py_XDECREF(exc);
414 Py_XDECREF(result);
415 return NULL;
416#endif
417
418#undef MAX_SHORT_UNICHARS
419}
420
Antoine Pitrou63065d72012-05-15 23:48:04 +0200421/* The pattern for constructing UCS2-repeated masks. */
422#if SIZEOF_LONG == 8
423# define UCS2_REPEAT_MASK 0x0001000100010001ul
424#elif SIZEOF_LONG == 4
425# define UCS2_REPEAT_MASK 0x00010001ul
426#else
427# error C 'long' size should be either 4 or 8!
428#endif
429
430/* The mask for fast checking. */
431#if STRINGLIB_SIZEOF_CHAR == 1
432/* The mask for fast checking of whether a C 'long' contains a
433 non-ASCII or non-Latin1 UTF16-encoded characters. */
434# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
435#else
436/* The mask for fast checking of whether a C 'long' may contain
437 UTF16-encoded surrogate characters. This is an efficient heuristic,
438 assuming that non-surrogate characters with a code point >= 0x8000 are
439 rare in most input.
440*/
441# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
442#endif
443/* The mask for fast byte-swapping. */
444#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
445/* Swap bytes. */
446#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
447 (((value) & STRIPPED_MASK) << 8))
448
449Py_LOCAL_INLINE(Py_UCS4)
450STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
451 STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
452 int native_ordering)
453{
454 Py_UCS4 ch;
455 const unsigned char *aligned_end =
456 (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
457 const unsigned char *q = *inptr;
458 STRINGLIB_CHAR *p = dest + *outpos;
459 /* Offsets from q for retrieving byte pairs in the right order. */
460#ifdef BYTEORDER_IS_LITTLE_ENDIAN
461 int ihi = !!native_ordering, ilo = !native_ordering;
462#else
463 int ihi = !native_ordering, ilo = !!native_ordering;
464#endif
465 --e;
466
467 while (q < e) {
468 Py_UCS4 ch2;
469 /* First check for possible aligned read of a C 'long'. Unaligned
470 reads are more expensive, better to defer to another iteration. */
471 if (!((size_t) q & LONG_PTR_MASK)) {
472 /* Fast path for runs of in-range non-surrogate chars. */
473 register const unsigned char *_q = q;
474 while (_q < aligned_end) {
475 unsigned long block = * (unsigned long *) _q;
476 if (native_ordering) {
477 /* Can use buffer directly */
478 if (block & FAST_CHAR_MASK)
479 break;
480 }
481 else {
482 /* Need to byte-swap */
483 if (block & SWAB(FAST_CHAR_MASK))
484 break;
485#if STRINGLIB_SIZEOF_CHAR == 1
486 block >>= 8;
487#else
488 block = SWAB(block);
489#endif
490 }
491#ifdef BYTEORDER_IS_LITTLE_ENDIAN
492# if SIZEOF_LONG == 4
493 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
494 p[1] = (STRINGLIB_CHAR)(block >> 16);
495# elif SIZEOF_LONG == 8
496 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
497 p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
498 p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
499 p[3] = (STRINGLIB_CHAR)(block >> 48);
500# endif
501#else
502# if SIZEOF_LONG == 4
503 p[0] = (STRINGLIB_CHAR)(block >> 16);
504 p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
505# elif SIZEOF_LONG == 8
506 p[0] = (STRINGLIB_CHAR)(block >> 48);
507 p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
508 p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
509 p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
510# endif
511#endif
512 _q += SIZEOF_LONG;
513 p += SIZEOF_LONG / 2;
514 }
515 q = _q;
516 if (q >= e)
517 break;
518 }
519
520 ch = (q[ihi] << 8) | q[ilo];
521 q += 2;
522 if (!Py_UNICODE_IS_SURROGATE(ch)) {
523#if STRINGLIB_SIZEOF_CHAR < 2
524 if (ch > STRINGLIB_MAX_CHAR)
525 /* Out-of-range */
526 goto Return;
527#endif
528 *p++ = (STRINGLIB_CHAR)ch;
529 continue;
530 }
531
532 /* UTF-16 code pair: */
533 if (q >= e)
534 goto UnexpectedEnd;
535 if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
536 goto IllegalEncoding;
537 ch2 = (q[ihi] << 8) | q[ilo];
538 q += 2;
539 if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
540 goto IllegalSurrogate;
541 ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
542#if STRINGLIB_SIZEOF_CHAR < 4
543 /* Out-of-range */
544 goto Return;
545#else
546 *p++ = (STRINGLIB_CHAR)ch;
547#endif
548 }
549 ch = 0;
550Return:
551 *inptr = q;
552 *outpos = p - dest;
553 return ch;
554UnexpectedEnd:
555 ch = 1;
556 goto Return;
557IllegalEncoding:
558 ch = 2;
559 goto Return;
560IllegalSurrogate:
561 ch = 3;
562 goto Return;
563}
564#undef UCS2_REPEAT_MASK
565#undef FAST_CHAR_MASK
566#undef STRIPPED_MASK
567#undef SWAB
568#undef LONG_PTR_MASK
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200569
570
571Py_LOCAL_INLINE(void)
572STRINGLIB(utf16_encode)(unsigned short *out,
573 const STRINGLIB_CHAR *in,
574 Py_ssize_t len,
575 int native_ordering)
576{
577 const STRINGLIB_CHAR *end = in + len;
578#if STRINGLIB_SIZEOF_CHAR == 1
579# define SWAB2(CH) ((CH) << 8)
580#else
581# define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
582#endif
583#if STRINGLIB_MAX_CHAR < 0x10000
584 if (native_ordering) {
585# if STRINGLIB_SIZEOF_CHAR == 2
586 Py_MEMCPY(out, in, 2 * len);
587# else
588 _PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out);
589# endif
590 } else {
591 const STRINGLIB_CHAR *unrolled_end = in + (len & ~ (Py_ssize_t) 3);
592 while (in < unrolled_end) {
593 out[0] = SWAB2(in[0]);
594 out[1] = SWAB2(in[1]);
595 out[2] = SWAB2(in[2]);
596 out[3] = SWAB2(in[3]);
597 in += 4; out += 4;
598 }
599 while (in < end) {
600 *out++ = SWAB2(*in);
601 ++in;
602 }
603 }
604#else
605 if (native_ordering) {
606 while (in < end) {
607 Py_UCS4 ch = *in++;
608 if (ch < 0x10000)
609 *out++ = ch;
610 else {
611 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
612 out[1] = Py_UNICODE_LOW_SURROGATE(ch);
613 out += 2;
614 }
615 }
616 } else {
617 while (in < end) {
618 Py_UCS4 ch = *in++;
619 if (ch < 0x10000)
620 *out++ = SWAB2((Py_UCS2)ch);
621 else {
622 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
623 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
624 out[0] = SWAB2(ch1);
625 out[1] = SWAB2(ch2);
626 out += 2;
627 }
628 }
629 }
630#endif
631#undef SWAB2
632}
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100633#endif /* STRINGLIB_IS_UNICODE */