/* stringlib: codec implementations */

3#if STRINGLIB_IS_UNICODE
4
5/* Mask to check or force alignment of a pointer to C 'long' boundaries */
6#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
7
8/* Mask to quickly check whether a C 'long' contains a
9 non-ASCII, UTF8-encoded char. */
10#if (SIZEOF_LONG == 8)
11# define ASCII_CHAR_MASK 0x8080808080808080L
12#elif (SIZEOF_LONG == 4)
13# define ASCII_CHAR_MASK 0x80808080L
14#else
15# error C 'long' size should be either 4 or 8!
16#endif
17
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020018Py_LOCAL_INLINE(Py_UCS4)
19STRINGLIB(utf8_decode)(const char **inptr, const char *end,
20 STRINGLIB_CHAR *dest,
21 Py_ssize_t *outpos)
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010022{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020023 Py_UCS4 ch;
24 const char *s = *inptr;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010025 const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020026 STRINGLIB_CHAR *p = dest + *outpos;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010027
28 while (s < end) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020029 ch = (unsigned char)*s;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010030
31 if (ch < 0x80) {
32 /* Fast path for runs of ASCII characters. Given that common UTF-8
33 input will consist of an overwhelming majority of ASCII
34 characters, we try to optimize for this case by checking
35 as many characters as a C 'long' can contain.
36 First, check if we can do an aligned read, as most CPUs have
37 a penalty for unaligned reads.
38 */
39 if (!((size_t) s & LONG_PTR_MASK)) {
40 /* Help register allocation */
41 register const char *_s = s;
42 register STRINGLIB_CHAR *_p = p;
43 while (_s < aligned_end) {
44 /* Read a whole long at a time (either 4 or 8 bytes),
45 and do a fast unrolled copy if it only contains ASCII
46 characters. */
47 unsigned long value = *(unsigned long *) _s;
48 if (value & ASCII_CHAR_MASK)
49 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020050#ifdef BYTEORDER_IS_LITTLE_ENDIAN
51 _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
52 _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
53 _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
54 _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
55# if SIZEOF_LONG == 8
56 _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
57 _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
58 _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
59 _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
60# endif
61#else
62# if SIZEOF_LONG == 8
63 _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
64 _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
65 _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
66 _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
67 _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
68 _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
69 _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
70 _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
71# else
72 _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
73 _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
74 _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
75 _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
76# endif
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010077#endif
78 _s += SIZEOF_LONG;
79 _p += SIZEOF_LONG;
80 }
81 s = _s;
82 p = _p;
83 if (s == end)
84 break;
85 ch = (unsigned char)*s;
86 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020087 if (ch < 0x80) {
88 s++;
89 *p++ = ch;
90 continue;
91 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010092 }
93
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020094 if (ch < 0xC2) {
95 /* invalid sequence
96 \x80-\xBF -- continuation byte
97 \xC0-\xC1 -- fake 0000-007F */
98 goto InvalidStart;
99 }
100
101 if (ch < 0xE0) {
102 /* \xC2\x80-\xDF\xBF -- 0080-07FF */
103 Py_UCS4 ch2;
104 if (end - s < 2) {
105 /* unexpected end of data: the caller will decide whether
106 it's an error or not */
107 break;
108 }
109 ch2 = (unsigned char)s[1];
110 if ((ch2 & 0xC0) != 0x80)
111 /* invalid continuation byte */
112 goto InvalidContinuation;
113 ch = (ch << 6) + ch2 -
114 ((0xC0 << 6) + 0x80);
115 assert ((ch > 0x007F) && (ch <= 0x07FF));
116 s += 2;
117 if (STRINGLIB_MAX_CHAR <= 0x007F ||
118 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
119 goto Overflow;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100120 *p++ = ch;
121 continue;
122 }
123
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200124 if (ch < 0xF0) {
125 /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
126 Py_UCS4 ch2, ch3;
127 if (end - s < 3) {
128 /* unexpected end of data: the caller will decide whether
129 it's an error or not */
130 break;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100131 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200132 ch2 = (unsigned char)s[1];
133 ch3 = (unsigned char)s[2];
134 if ((ch2 & 0xC0) != 0x80 ||
135 (ch3 & 0xC0) != 0x80) {
136 /* invalid continuation byte */
137 goto InvalidContinuation;
138 }
139 if (ch == 0xE0) {
140 if (ch2 < 0xA0)
141 /* invalid sequence
142 \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
143 goto InvalidContinuation;
144 }
145 else if (ch == 0xED && ch2 > 0x9F) {
146 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
147 will result in surrogates in range D800-DFFF. Surrogates are
148 not valid UTF-8 so they are rejected.
149 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
150 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
151 goto InvalidContinuation;
152 }
153 ch = (ch << 12) + (ch2 << 6) + ch3 -
154 ((0xE0 << 12) + (0x80 << 6) + 0x80);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100155 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
156 s += 3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200157 if (STRINGLIB_MAX_CHAR <= 0x07FF ||
158 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
159 goto Overflow;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100160 *p++ = ch;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200161 continue;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100162 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200163
164 if (ch < 0xF5) {
165 /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
166 Py_UCS4 ch2, ch3, ch4;
167 if (end - s < 4) {
168 /* unexpected end of data: the caller will decide whether
169 it's an error or not */
170 break;
171 }
172 ch2 = (unsigned char)s[1];
173 ch3 = (unsigned char)s[2];
174 ch4 = (unsigned char)s[3];
175 if ((ch2 & 0xC0) != 0x80 ||
176 (ch3 & 0xC0) != 0x80 ||
177 (ch4 & 0xC0) != 0x80) {
178 /* invalid continuation byte */
179 goto InvalidContinuation;
180 }
181 if (ch == 0xF0) {
182 if (ch2 < 0x90)
183 /* invalid sequence
184 \xF0\x80\x80\x80-\xF0\x80\xBF\xBF -- fake 0000-FFFF */
185 goto InvalidContinuation;
186 }
187 else if (ch == 0xF4 && ch2 > 0x8F) {
188 /* invalid sequence
189 \xF4\x90\x80\80- -- 110000- overflow */
190 goto InvalidContinuation;
191 }
192 ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
193 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
194 assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
195 s += 4;
196 if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
197 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
198 goto Overflow;
199 *p++ = ch;
200 continue;
201 }
202 goto InvalidStart;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100203 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200204 ch = 0;
205Overflow:
206Return:
207 *inptr = s;
208 *outpos = p - dest;
209 return ch;
210InvalidStart:
211 ch = 1;
212 goto Return;
213InvalidContinuation:
214 ch = 2;
215 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100216}
217
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100218#undef ASCII_CHAR_MASK
219
Victor Stinner6099a032011-12-18 14:22:26 +0100220
221/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
222 PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
223 UCS-1 strings don't need to handle surrogates for example. */
224Py_LOCAL_INLINE(PyObject *)
225STRINGLIB(utf8_encoder)(PyObject *unicode,
226 STRINGLIB_CHAR *data,
227 Py_ssize_t size,
228 const char *errors)
229{
230#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
231
232 Py_ssize_t i; /* index into s of next input byte */
233 PyObject *result; /* result string object */
234 char *p; /* next free byte in output buffer */
235 Py_ssize_t nallocated; /* number of result bytes allocated */
236 Py_ssize_t nneeded; /* number of result bytes needed */
237#if STRINGLIB_SIZEOF_CHAR > 1
238 PyObject *errorHandler = NULL;
239 PyObject *exc = NULL;
240 PyObject *rep = NULL;
241#endif
242#if STRINGLIB_SIZEOF_CHAR == 1
243 const Py_ssize_t max_char_size = 2;
244 char stackbuf[MAX_SHORT_UNICHARS * 2];
245#elif STRINGLIB_SIZEOF_CHAR == 2
246 const Py_ssize_t max_char_size = 3;
247 char stackbuf[MAX_SHORT_UNICHARS * 3];
248#else /* STRINGLIB_SIZEOF_CHAR == 4 */
249 const Py_ssize_t max_char_size = 4;
250 char stackbuf[MAX_SHORT_UNICHARS * 4];
251#endif
252
253 assert(size >= 0);
254
255 if (size <= MAX_SHORT_UNICHARS) {
256 /* Write into the stack buffer; nallocated can't overflow.
257 * At the end, we'll allocate exactly as much heap space as it
258 * turns out we need.
259 */
260 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
261 result = NULL; /* will allocate after we're done */
262 p = stackbuf;
263 }
264 else {
265 if (size > PY_SSIZE_T_MAX / max_char_size) {
266 /* integer overflow */
267 return PyErr_NoMemory();
268 }
269 /* Overallocate on the heap, and give the excess back at the end. */
270 nallocated = size * max_char_size;
271 result = PyBytes_FromStringAndSize(NULL, nallocated);
272 if (result == NULL)
273 return NULL;
274 p = PyBytes_AS_STRING(result);
275 }
276
277 for (i = 0; i < size;) {
278 Py_UCS4 ch = data[i++];
279
280 if (ch < 0x80) {
281 /* Encode ASCII */
282 *p++ = (char) ch;
283
284 }
285 else
286#if STRINGLIB_SIZEOF_CHAR > 1
287 if (ch < 0x0800)
288#endif
289 {
290 /* Encode Latin-1 */
291 *p++ = (char)(0xc0 | (ch >> 6));
292 *p++ = (char)(0x80 | (ch & 0x3f));
293 }
294#if STRINGLIB_SIZEOF_CHAR > 1
295 else if (Py_UNICODE_IS_SURROGATE(ch)) {
296 Py_ssize_t newpos;
297 Py_ssize_t repsize, k, startpos;
298 startpos = i-1;
299 rep = unicode_encode_call_errorhandler(
300 errors, &errorHandler, "utf-8", "surrogates not allowed",
301 unicode, &exc, startpos, startpos+1, &newpos);
302 if (!rep)
303 goto error;
304
305 if (PyBytes_Check(rep))
306 repsize = PyBytes_GET_SIZE(rep);
307 else
308 repsize = PyUnicode_GET_LENGTH(rep);
309
310 if (repsize > max_char_size) {
311 Py_ssize_t offset;
312
313 if (result == NULL)
314 offset = p - stackbuf;
315 else
316 offset = p - PyBytes_AS_STRING(result);
317
318 if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
319 /* integer overflow */
320 PyErr_NoMemory();
321 goto error;
322 }
323 nallocated += repsize - max_char_size;
324 if (result != NULL) {
325 if (_PyBytes_Resize(&result, nallocated) < 0)
326 goto error;
327 } else {
328 result = PyBytes_FromStringAndSize(NULL, nallocated);
329 if (result == NULL)
330 goto error;
331 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
332 }
333 p = PyBytes_AS_STRING(result) + offset;
334 }
335
336 if (PyBytes_Check(rep)) {
337 char *prep = PyBytes_AS_STRING(rep);
338 for(k = repsize; k > 0; k--)
339 *p++ = *prep++;
340 } else /* rep is unicode */ {
341 enum PyUnicode_Kind repkind;
342 void *repdata;
343
344 if (PyUnicode_READY(rep) < 0)
345 goto error;
346 repkind = PyUnicode_KIND(rep);
347 repdata = PyUnicode_DATA(rep);
348
349 for(k=0; k<repsize; k++) {
350 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
351 if (0x80 <= c) {
352 raise_encode_exception(&exc, "utf-8",
353 unicode,
354 i-1, i,
355 "surrogates not allowed");
356 goto error;
357 }
358 *p++ = (char)c;
359 }
360 }
361 Py_CLEAR(rep);
362 }
363 else
364#if STRINGLIB_SIZEOF_CHAR > 2
365 if (ch < 0x10000)
366#endif
367 {
368 *p++ = (char)(0xe0 | (ch >> 12));
369 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
370 *p++ = (char)(0x80 | (ch & 0x3f));
371 }
372#if STRINGLIB_SIZEOF_CHAR > 2
373 else /* ch >= 0x10000 */
374 {
375 assert(ch <= MAX_UNICODE);
376 /* Encode UCS4 Unicode ordinals */
377 *p++ = (char)(0xf0 | (ch >> 18));
378 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
379 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
380 *p++ = (char)(0x80 | (ch & 0x3f));
381 }
382#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
383#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
384 }
385
386 if (result == NULL) {
387 /* This was stack allocated. */
388 nneeded = p - stackbuf;
389 assert(nneeded <= nallocated);
390 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
391 }
392 else {
393 /* Cut back to size actually needed. */
394 nneeded = p - PyBytes_AS_STRING(result);
395 assert(nneeded <= nallocated);
396 _PyBytes_Resize(&result, nneeded);
397 }
398
399#if STRINGLIB_SIZEOF_CHAR > 1
400 Py_XDECREF(errorHandler);
401 Py_XDECREF(exc);
402#endif
403 return result;
404
405#if STRINGLIB_SIZEOF_CHAR > 1
406 error:
407 Py_XDECREF(rep);
408 Py_XDECREF(errorHandler);
409 Py_XDECREF(exc);
410 Py_XDECREF(result);
411 return NULL;
412#endif
413
414#undef MAX_SHORT_UNICHARS
415}
416
Antoine Pitrou63065d72012-05-15 23:48:04 +0200417/* The pattern for constructing UCS2-repeated masks. */
418#if SIZEOF_LONG == 8
419# define UCS2_REPEAT_MASK 0x0001000100010001ul
420#elif SIZEOF_LONG == 4
421# define UCS2_REPEAT_MASK 0x00010001ul
422#else
423# error C 'long' size should be either 4 or 8!
424#endif
425
426/* The mask for fast checking. */
427#if STRINGLIB_SIZEOF_CHAR == 1
428/* The mask for fast checking of whether a C 'long' contains a
429 non-ASCII or non-Latin1 UTF16-encoded characters. */
430# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
431#else
432/* The mask for fast checking of whether a C 'long' may contain
433 UTF16-encoded surrogate characters. This is an efficient heuristic,
434 assuming that non-surrogate characters with a code point >= 0x8000 are
435 rare in most input.
436*/
437# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
438#endif
439/* The mask for fast byte-swapping. */
440#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
441/* Swap bytes. */
442#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
443 (((value) & STRIPPED_MASK) << 8))
444
445Py_LOCAL_INLINE(Py_UCS4)
446STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
447 STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
448 int native_ordering)
449{
450 Py_UCS4 ch;
451 const unsigned char *aligned_end =
452 (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
453 const unsigned char *q = *inptr;
454 STRINGLIB_CHAR *p = dest + *outpos;
455 /* Offsets from q for retrieving byte pairs in the right order. */
456#ifdef BYTEORDER_IS_LITTLE_ENDIAN
457 int ihi = !!native_ordering, ilo = !native_ordering;
458#else
459 int ihi = !native_ordering, ilo = !!native_ordering;
460#endif
461 --e;
462
463 while (q < e) {
464 Py_UCS4 ch2;
465 /* First check for possible aligned read of a C 'long'. Unaligned
466 reads are more expensive, better to defer to another iteration. */
467 if (!((size_t) q & LONG_PTR_MASK)) {
468 /* Fast path for runs of in-range non-surrogate chars. */
469 register const unsigned char *_q = q;
470 while (_q < aligned_end) {
471 unsigned long block = * (unsigned long *) _q;
472 if (native_ordering) {
473 /* Can use buffer directly */
474 if (block & FAST_CHAR_MASK)
475 break;
476 }
477 else {
478 /* Need to byte-swap */
479 if (block & SWAB(FAST_CHAR_MASK))
480 break;
481#if STRINGLIB_SIZEOF_CHAR == 1
482 block >>= 8;
483#else
484 block = SWAB(block);
485#endif
486 }
487#ifdef BYTEORDER_IS_LITTLE_ENDIAN
488# if SIZEOF_LONG == 4
489 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
490 p[1] = (STRINGLIB_CHAR)(block >> 16);
491# elif SIZEOF_LONG == 8
492 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
493 p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
494 p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
495 p[3] = (STRINGLIB_CHAR)(block >> 48);
496# endif
497#else
498# if SIZEOF_LONG == 4
499 p[0] = (STRINGLIB_CHAR)(block >> 16);
500 p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
501# elif SIZEOF_LONG == 8
502 p[0] = (STRINGLIB_CHAR)(block >> 48);
503 p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
504 p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
505 p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
506# endif
507#endif
508 _q += SIZEOF_LONG;
509 p += SIZEOF_LONG / 2;
510 }
511 q = _q;
512 if (q >= e)
513 break;
514 }
515
516 ch = (q[ihi] << 8) | q[ilo];
517 q += 2;
518 if (!Py_UNICODE_IS_SURROGATE(ch)) {
519#if STRINGLIB_SIZEOF_CHAR < 2
520 if (ch > STRINGLIB_MAX_CHAR)
521 /* Out-of-range */
522 goto Return;
523#endif
524 *p++ = (STRINGLIB_CHAR)ch;
525 continue;
526 }
527
528 /* UTF-16 code pair: */
529 if (q >= e)
530 goto UnexpectedEnd;
531 if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
532 goto IllegalEncoding;
533 ch2 = (q[ihi] << 8) | q[ilo];
534 q += 2;
535 if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
536 goto IllegalSurrogate;
537 ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
538#if STRINGLIB_SIZEOF_CHAR < 4
539 /* Out-of-range */
540 goto Return;
541#else
542 *p++ = (STRINGLIB_CHAR)ch;
543#endif
544 }
545 ch = 0;
546Return:
547 *inptr = q;
548 *outpos = p - dest;
549 return ch;
550UnexpectedEnd:
551 ch = 1;
552 goto Return;
553IllegalEncoding:
554 ch = 2;
555 goto Return;
556IllegalSurrogate:
557 ch = 3;
558 goto Return;
559}
560#undef UCS2_REPEAT_MASK
561#undef FAST_CHAR_MASK
562#undef STRIPPED_MASK
563#undef SWAB
564#undef LONG_PTR_MASK
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200565
566
567Py_LOCAL_INLINE(void)
568STRINGLIB(utf16_encode)(unsigned short *out,
569 const STRINGLIB_CHAR *in,
570 Py_ssize_t len,
571 int native_ordering)
572{
573 const STRINGLIB_CHAR *end = in + len;
574#if STRINGLIB_SIZEOF_CHAR == 1
575# define SWAB2(CH) ((CH) << 8)
576#else
577# define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
578#endif
579#if STRINGLIB_MAX_CHAR < 0x10000
580 if (native_ordering) {
581# if STRINGLIB_SIZEOF_CHAR == 2
582 Py_MEMCPY(out, in, 2 * len);
583# else
584 _PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out);
585# endif
586 } else {
587 const STRINGLIB_CHAR *unrolled_end = in + (len & ~ (Py_ssize_t) 3);
588 while (in < unrolled_end) {
589 out[0] = SWAB2(in[0]);
590 out[1] = SWAB2(in[1]);
591 out[2] = SWAB2(in[2]);
592 out[3] = SWAB2(in[3]);
593 in += 4; out += 4;
594 }
595 while (in < end) {
596 *out++ = SWAB2(*in);
597 ++in;
598 }
599 }
600#else
601 if (native_ordering) {
602 while (in < end) {
603 Py_UCS4 ch = *in++;
604 if (ch < 0x10000)
605 *out++ = ch;
606 else {
607 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
608 out[1] = Py_UNICODE_LOW_SURROGATE(ch);
609 out += 2;
610 }
611 }
612 } else {
613 while (in < end) {
614 Py_UCS4 ch = *in++;
615 if (ch < 0x10000)
616 *out++ = SWAB2((Py_UCS2)ch);
617 else {
618 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
619 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
620 out[0] = SWAB2(ch1);
621 out[1] = SWAB2(ch2);
622 out += 2;
623 }
624 }
625 }
626#endif
627#undef SWAB2
628}
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100629#endif /* STRINGLIB_IS_UNICODE */