/* stringlib: codec implementations */
2
3#if STRINGLIB_IS_UNICODE
4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005/* Mask to quickly check whether a C 'long' contains a
6 non-ASCII, UTF8-encoded char. */
7#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02008# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01009#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +020010# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010011#else
12# error C 'long' size should be either 4 or 8!
13#endif
14
/* True iff ch is a UTF-8 continuation byte, i.e. has the bit pattern
   10xxxxxx (0x80..0xBF).  ch is evaluated twice, so it must not have
   side effects. */
#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
17
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020018Py_LOCAL_INLINE(Py_UCS4)
19STRINGLIB(utf8_decode)(const char **inptr, const char *end,
20 STRINGLIB_CHAR *dest,
21 Py_ssize_t *outpos)
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010022{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020023 Py_UCS4 ch;
24 const char *s = *inptr;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020025 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020026 STRINGLIB_CHAR *p = dest + *outpos;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010027
28 while (s < end) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020029 ch = (unsigned char)*s;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010030
31 if (ch < 0x80) {
32 /* Fast path for runs of ASCII characters. Given that common UTF-8
33 input will consist of an overwhelming majority of ASCII
34 characters, we try to optimize for this case by checking
35 as many characters as a C 'long' can contain.
36 First, check if we can do an aligned read, as most CPUs have
37 a penalty for unaligned reads.
38 */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020039 if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010040 /* Help register allocation */
41 register const char *_s = s;
42 register STRINGLIB_CHAR *_p = p;
43 while (_s < aligned_end) {
44 /* Read a whole long at a time (either 4 or 8 bytes),
45 and do a fast unrolled copy if it only contains ASCII
46 characters. */
47 unsigned long value = *(unsigned long *) _s;
48 if (value & ASCII_CHAR_MASK)
49 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020050#ifdef BYTEORDER_IS_LITTLE_ENDIAN
51 _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
52 _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
53 _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
54 _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
55# if SIZEOF_LONG == 8
56 _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
57 _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
58 _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
59 _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
60# endif
61#else
62# if SIZEOF_LONG == 8
63 _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
64 _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
65 _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
66 _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
67 _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
68 _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
69 _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
70 _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
71# else
72 _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
73 _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
74 _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
75 _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
76# endif
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010077#endif
78 _s += SIZEOF_LONG;
79 _p += SIZEOF_LONG;
80 }
81 s = _s;
82 p = _p;
83 if (s == end)
84 break;
85 ch = (unsigned char)*s;
86 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020087 if (ch < 0x80) {
88 s++;
89 *p++ = ch;
90 continue;
91 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010092 }
93
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020094 if (ch < 0xE0) {
95 /* \xC2\x80-\xDF\xBF -- 0080-07FF */
Victor Stinnerab60de42012-11-04 23:59:15 +010096 Py_UCS4 ch2;
Ezio Melottif7ed5d12012-11-04 23:21:38 +020097 if (ch < 0xC2) {
98 /* invalid sequence
99 \x80-\xBF -- continuation byte
100 \xC0-\xC1 -- fake 0000-007F */
101 goto InvalidStart;
102 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200103 if (end - s < 2) {
104 /* unexpected end of data: the caller will decide whether
105 it's an error or not */
106 break;
107 }
108 ch2 = (unsigned char)s[1];
Mark Dickinson106c4142012-06-23 21:45:14 +0100109 if (!IS_CONTINUATION_BYTE(ch2))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200110 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200111 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200112 ch = (ch << 6) + ch2 -
113 ((0xC0 << 6) + 0x80);
114 assert ((ch > 0x007F) && (ch <= 0x07FF));
115 s += 2;
116 if (STRINGLIB_MAX_CHAR <= 0x007F ||
117 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200118 /* Out-of-range */
119 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100120 *p++ = ch;
121 continue;
122 }
123
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200124 if (ch < 0xF0) {
125 /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
126 Py_UCS4 ch2, ch3;
127 if (end - s < 3) {
128 /* unexpected end of data: the caller will decide whether
129 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200130 if (end - s < 2)
131 break;
132 ch2 = (unsigned char)s[1];
133 if (!IS_CONTINUATION_BYTE(ch2) ||
134 (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
135 /* for clarification see comments below */
136 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200137 break;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100138 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200139 ch2 = (unsigned char)s[1];
140 ch3 = (unsigned char)s[2];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200141 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200142 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200143 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200144 }
145 if (ch == 0xE0) {
146 if (ch2 < 0xA0)
147 /* invalid sequence
148 \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200149 goto InvalidContinuation1;
150 } else if (ch == 0xED && ch2 >= 0xA0) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200151 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
152 will result in surrogates in range D800-DFFF. Surrogates are
153 not valid UTF-8 so they are rejected.
154 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
155 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200156 goto InvalidContinuation1;
157 }
158 if (!IS_CONTINUATION_BYTE(ch3)) {
159 /* invalid continuation byte */
160 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200161 }
162 ch = (ch << 12) + (ch2 << 6) + ch3 -
163 ((0xE0 << 12) + (0x80 << 6) + 0x80);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100164 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
165 s += 3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200166 if (STRINGLIB_MAX_CHAR <= 0x07FF ||
167 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200168 /* Out-of-range */
169 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100170 *p++ = ch;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200171 continue;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100172 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200173
174 if (ch < 0xF5) {
175 /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
176 Py_UCS4 ch2, ch3, ch4;
177 if (end - s < 4) {
178 /* unexpected end of data: the caller will decide whether
179 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200180 if (end - s < 2)
181 break;
182 ch2 = (unsigned char)s[1];
183 if (!IS_CONTINUATION_BYTE(ch2) ||
184 (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
185 /* for clarification see comments below */
186 goto InvalidContinuation1;
187 if (end - s < 3)
188 break;
189 ch3 = (unsigned char)s[2];
190 if (!IS_CONTINUATION_BYTE(ch3))
191 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200192 break;
193 }
194 ch2 = (unsigned char)s[1];
195 ch3 = (unsigned char)s[2];
196 ch4 = (unsigned char)s[3];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200197 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200198 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200199 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200200 }
201 if (ch == 0xF0) {
202 if (ch2 < 0x90)
203 /* invalid sequence
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200204 \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
205 goto InvalidContinuation1;
206 } else if (ch == 0xF4 && ch2 >= 0x90) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200207 /* invalid sequence
208 \xF4\x90\x80\80- -- 110000- overflow */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200209 goto InvalidContinuation1;
210 }
211 if (!IS_CONTINUATION_BYTE(ch3)) {
212 /* invalid continuation byte */
213 goto InvalidContinuation2;
214 }
215 if (!IS_CONTINUATION_BYTE(ch4)) {
216 /* invalid continuation byte */
217 goto InvalidContinuation3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200218 }
219 ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
220 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
221 assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
222 s += 4;
223 if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
224 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200225 /* Out-of-range */
226 goto Return;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200227 *p++ = ch;
228 continue;
229 }
230 goto InvalidStart;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100231 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200232 ch = 0;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200233Return:
234 *inptr = s;
235 *outpos = p - dest;
236 return ch;
237InvalidStart:
238 ch = 1;
239 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200240InvalidContinuation1:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200241 ch = 2;
242 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200243InvalidContinuation2:
244 ch = 3;
245 goto Return;
246InvalidContinuation3:
247 ch = 4;
248 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100249}
250
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100251#undef ASCII_CHAR_MASK
252
Victor Stinner6099a032011-12-18 14:22:26 +0100253
254/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
255 PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
256 UCS-1 strings don't need to handle surrogates for example. */
257Py_LOCAL_INLINE(PyObject *)
258STRINGLIB(utf8_encoder)(PyObject *unicode,
259 STRINGLIB_CHAR *data,
260 Py_ssize_t size,
261 const char *errors)
262{
263#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
264
265 Py_ssize_t i; /* index into s of next input byte */
266 PyObject *result; /* result string object */
267 char *p; /* next free byte in output buffer */
268 Py_ssize_t nallocated; /* number of result bytes allocated */
269 Py_ssize_t nneeded; /* number of result bytes needed */
270#if STRINGLIB_SIZEOF_CHAR > 1
271 PyObject *errorHandler = NULL;
272 PyObject *exc = NULL;
273 PyObject *rep = NULL;
274#endif
275#if STRINGLIB_SIZEOF_CHAR == 1
276 const Py_ssize_t max_char_size = 2;
277 char stackbuf[MAX_SHORT_UNICHARS * 2];
278#elif STRINGLIB_SIZEOF_CHAR == 2
279 const Py_ssize_t max_char_size = 3;
280 char stackbuf[MAX_SHORT_UNICHARS * 3];
281#else /* STRINGLIB_SIZEOF_CHAR == 4 */
282 const Py_ssize_t max_char_size = 4;
283 char stackbuf[MAX_SHORT_UNICHARS * 4];
284#endif
285
286 assert(size >= 0);
287
288 if (size <= MAX_SHORT_UNICHARS) {
289 /* Write into the stack buffer; nallocated can't overflow.
290 * At the end, we'll allocate exactly as much heap space as it
291 * turns out we need.
292 */
293 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
294 result = NULL; /* will allocate after we're done */
295 p = stackbuf;
296 }
297 else {
298 if (size > PY_SSIZE_T_MAX / max_char_size) {
299 /* integer overflow */
300 return PyErr_NoMemory();
301 }
302 /* Overallocate on the heap, and give the excess back at the end. */
303 nallocated = size * max_char_size;
304 result = PyBytes_FromStringAndSize(NULL, nallocated);
305 if (result == NULL)
306 return NULL;
307 p = PyBytes_AS_STRING(result);
308 }
309
310 for (i = 0; i < size;) {
311 Py_UCS4 ch = data[i++];
312
313 if (ch < 0x80) {
314 /* Encode ASCII */
315 *p++ = (char) ch;
316
317 }
318 else
319#if STRINGLIB_SIZEOF_CHAR > 1
320 if (ch < 0x0800)
321#endif
322 {
323 /* Encode Latin-1 */
324 *p++ = (char)(0xc0 | (ch >> 6));
325 *p++ = (char)(0x80 | (ch & 0x3f));
326 }
327#if STRINGLIB_SIZEOF_CHAR > 1
328 else if (Py_UNICODE_IS_SURROGATE(ch)) {
329 Py_ssize_t newpos;
330 Py_ssize_t repsize, k, startpos;
331 startpos = i-1;
332 rep = unicode_encode_call_errorhandler(
333 errors, &errorHandler, "utf-8", "surrogates not allowed",
334 unicode, &exc, startpos, startpos+1, &newpos);
335 if (!rep)
336 goto error;
337
338 if (PyBytes_Check(rep))
339 repsize = PyBytes_GET_SIZE(rep);
340 else
341 repsize = PyUnicode_GET_LENGTH(rep);
342
343 if (repsize > max_char_size) {
344 Py_ssize_t offset;
345
346 if (result == NULL)
347 offset = p - stackbuf;
348 else
349 offset = p - PyBytes_AS_STRING(result);
350
351 if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
352 /* integer overflow */
353 PyErr_NoMemory();
354 goto error;
355 }
356 nallocated += repsize - max_char_size;
357 if (result != NULL) {
358 if (_PyBytes_Resize(&result, nallocated) < 0)
359 goto error;
360 } else {
361 result = PyBytes_FromStringAndSize(NULL, nallocated);
362 if (result == NULL)
363 goto error;
364 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
365 }
366 p = PyBytes_AS_STRING(result) + offset;
367 }
368
369 if (PyBytes_Check(rep)) {
370 char *prep = PyBytes_AS_STRING(rep);
371 for(k = repsize; k > 0; k--)
372 *p++ = *prep++;
373 } else /* rep is unicode */ {
374 enum PyUnicode_Kind repkind;
375 void *repdata;
376
377 if (PyUnicode_READY(rep) < 0)
378 goto error;
379 repkind = PyUnicode_KIND(rep);
380 repdata = PyUnicode_DATA(rep);
381
382 for(k=0; k<repsize; k++) {
383 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
384 if (0x80 <= c) {
385 raise_encode_exception(&exc, "utf-8",
386 unicode,
387 i-1, i,
388 "surrogates not allowed");
389 goto error;
390 }
391 *p++ = (char)c;
392 }
393 }
394 Py_CLEAR(rep);
395 }
396 else
397#if STRINGLIB_SIZEOF_CHAR > 2
398 if (ch < 0x10000)
399#endif
400 {
401 *p++ = (char)(0xe0 | (ch >> 12));
402 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
403 *p++ = (char)(0x80 | (ch & 0x3f));
404 }
405#if STRINGLIB_SIZEOF_CHAR > 2
406 else /* ch >= 0x10000 */
407 {
408 assert(ch <= MAX_UNICODE);
409 /* Encode UCS4 Unicode ordinals */
410 *p++ = (char)(0xf0 | (ch >> 18));
411 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
412 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
413 *p++ = (char)(0x80 | (ch & 0x3f));
414 }
415#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
416#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
417 }
418
419 if (result == NULL) {
420 /* This was stack allocated. */
421 nneeded = p - stackbuf;
422 assert(nneeded <= nallocated);
423 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
424 }
425 else {
426 /* Cut back to size actually needed. */
427 nneeded = p - PyBytes_AS_STRING(result);
428 assert(nneeded <= nallocated);
429 _PyBytes_Resize(&result, nneeded);
430 }
431
432#if STRINGLIB_SIZEOF_CHAR > 1
433 Py_XDECREF(errorHandler);
434 Py_XDECREF(exc);
435#endif
436 return result;
437
438#if STRINGLIB_SIZEOF_CHAR > 1
439 error:
440 Py_XDECREF(rep);
441 Py_XDECREF(errorHandler);
442 Py_XDECREF(exc);
443 Py_XDECREF(result);
444 return NULL;
445#endif
446
447#undef MAX_SHORT_UNICHARS
448}
449
Antoine Pitrou63065d72012-05-15 23:48:04 +0200450/* The pattern for constructing UCS2-repeated masks. */
451#if SIZEOF_LONG == 8
452# define UCS2_REPEAT_MASK 0x0001000100010001ul
453#elif SIZEOF_LONG == 4
454# define UCS2_REPEAT_MASK 0x00010001ul
455#else
456# error C 'long' size should be either 4 or 8!
457#endif
458
459/* The mask for fast checking. */
460#if STRINGLIB_SIZEOF_CHAR == 1
461/* The mask for fast checking of whether a C 'long' contains a
462 non-ASCII or non-Latin1 UTF16-encoded characters. */
463# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
464#else
465/* The mask for fast checking of whether a C 'long' may contain
466 UTF16-encoded surrogate characters. This is an efficient heuristic,
467 assuming that non-surrogate characters with a code point >= 0x8000 are
468 rare in most input.
469*/
470# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
471#endif
472/* The mask for fast byte-swapping. */
473#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
474/* Swap bytes. */
475#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
476 (((value) & STRIPPED_MASK) << 8))
477
478Py_LOCAL_INLINE(Py_UCS4)
479STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
480 STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
481 int native_ordering)
482{
483 Py_UCS4 ch;
484 const unsigned char *aligned_end =
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200485 (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
Antoine Pitrou63065d72012-05-15 23:48:04 +0200486 const unsigned char *q = *inptr;
487 STRINGLIB_CHAR *p = dest + *outpos;
488 /* Offsets from q for retrieving byte pairs in the right order. */
489#ifdef BYTEORDER_IS_LITTLE_ENDIAN
490 int ihi = !!native_ordering, ilo = !native_ordering;
491#else
492 int ihi = !native_ordering, ilo = !!native_ordering;
493#endif
494 --e;
495
496 while (q < e) {
497 Py_UCS4 ch2;
498 /* First check for possible aligned read of a C 'long'. Unaligned
499 reads are more expensive, better to defer to another iteration. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200500 if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
Antoine Pitrou63065d72012-05-15 23:48:04 +0200501 /* Fast path for runs of in-range non-surrogate chars. */
502 register const unsigned char *_q = q;
503 while (_q < aligned_end) {
504 unsigned long block = * (unsigned long *) _q;
505 if (native_ordering) {
506 /* Can use buffer directly */
507 if (block & FAST_CHAR_MASK)
508 break;
509 }
510 else {
511 /* Need to byte-swap */
512 if (block & SWAB(FAST_CHAR_MASK))
513 break;
514#if STRINGLIB_SIZEOF_CHAR == 1
515 block >>= 8;
516#else
517 block = SWAB(block);
518#endif
519 }
520#ifdef BYTEORDER_IS_LITTLE_ENDIAN
521# if SIZEOF_LONG == 4
522 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
523 p[1] = (STRINGLIB_CHAR)(block >> 16);
524# elif SIZEOF_LONG == 8
525 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
526 p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
527 p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
528 p[3] = (STRINGLIB_CHAR)(block >> 48);
529# endif
530#else
531# if SIZEOF_LONG == 4
532 p[0] = (STRINGLIB_CHAR)(block >> 16);
533 p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
534# elif SIZEOF_LONG == 8
535 p[0] = (STRINGLIB_CHAR)(block >> 48);
536 p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
537 p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
538 p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
539# endif
540#endif
541 _q += SIZEOF_LONG;
542 p += SIZEOF_LONG / 2;
543 }
544 q = _q;
545 if (q >= e)
546 break;
547 }
548
549 ch = (q[ihi] << 8) | q[ilo];
550 q += 2;
551 if (!Py_UNICODE_IS_SURROGATE(ch)) {
552#if STRINGLIB_SIZEOF_CHAR < 2
553 if (ch > STRINGLIB_MAX_CHAR)
554 /* Out-of-range */
555 goto Return;
556#endif
557 *p++ = (STRINGLIB_CHAR)ch;
558 continue;
559 }
560
561 /* UTF-16 code pair: */
562 if (q >= e)
563 goto UnexpectedEnd;
564 if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
565 goto IllegalEncoding;
566 ch2 = (q[ihi] << 8) | q[ilo];
567 q += 2;
568 if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
569 goto IllegalSurrogate;
570 ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
571#if STRINGLIB_SIZEOF_CHAR < 4
572 /* Out-of-range */
573 goto Return;
574#else
575 *p++ = (STRINGLIB_CHAR)ch;
576#endif
577 }
578 ch = 0;
579Return:
580 *inptr = q;
581 *outpos = p - dest;
582 return ch;
583UnexpectedEnd:
584 ch = 1;
585 goto Return;
586IllegalEncoding:
587 ch = 2;
588 goto Return;
589IllegalSurrogate:
590 ch = 3;
591 goto Return;
592}
593#undef UCS2_REPEAT_MASK
594#undef FAST_CHAR_MASK
595#undef STRIPPED_MASK
596#undef SWAB
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200597
598
599Py_LOCAL_INLINE(void)
600STRINGLIB(utf16_encode)(unsigned short *out,
601 const STRINGLIB_CHAR *in,
602 Py_ssize_t len,
603 int native_ordering)
604{
605 const STRINGLIB_CHAR *end = in + len;
606#if STRINGLIB_SIZEOF_CHAR == 1
607# define SWAB2(CH) ((CH) << 8)
608#else
609# define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
610#endif
611#if STRINGLIB_MAX_CHAR < 0x10000
612 if (native_ordering) {
613# if STRINGLIB_SIZEOF_CHAR == 2
614 Py_MEMCPY(out, in, 2 * len);
615# else
616 _PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out);
617# endif
618 } else {
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200619 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200620 while (in < unrolled_end) {
621 out[0] = SWAB2(in[0]);
622 out[1] = SWAB2(in[1]);
623 out[2] = SWAB2(in[2]);
624 out[3] = SWAB2(in[3]);
625 in += 4; out += 4;
626 }
627 while (in < end) {
628 *out++ = SWAB2(*in);
629 ++in;
630 }
631 }
632#else
633 if (native_ordering) {
634 while (in < end) {
635 Py_UCS4 ch = *in++;
636 if (ch < 0x10000)
637 *out++ = ch;
638 else {
639 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
640 out[1] = Py_UNICODE_LOW_SURROGATE(ch);
641 out += 2;
642 }
643 }
644 } else {
645 while (in < end) {
646 Py_UCS4 ch = *in++;
647 if (ch < 0x10000)
648 *out++ = SWAB2((Py_UCS2)ch);
649 else {
650 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
651 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
652 out[0] = SWAB2(ch1);
653 out[1] = SWAB2(ch2);
654 out += 2;
655 }
656 }
657 }
658#endif
659#undef SWAB2
660}
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100661#endif /* STRINGLIB_IS_UNICODE */