blob: d7a991855bdc4754edc2d1703bcbbd9571cfa320 [file] [log] [blame]
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01001/* stringlib: codec implementations */
2
3#if STRINGLIB_IS_UNICODE
4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005/* Mask to quickly check whether a C 'long' contains a
6 non-ASCII, UTF8-encoded char. */
7#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02008# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01009#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +020010# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010011#else
12# error C 'long' size should be either 4 or 8!
13#endif
14
/* True iff ch is a UTF-8 continuation byte, i.e. 10xxxxxx (0x80..0xBF).
   The argument is evaluated twice: pass an expression without side
   effects. */
#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)

Antoine Pitrouca5f91b2012-05-10 16:36:02 +020018Py_LOCAL_INLINE(Py_UCS4)
19STRINGLIB(utf8_decode)(const char **inptr, const char *end,
20 STRINGLIB_CHAR *dest,
21 Py_ssize_t *outpos)
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010022{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020023 Py_UCS4 ch;
24 const char *s = *inptr;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020025 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020026 STRINGLIB_CHAR *p = dest + *outpos;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010027
28 while (s < end) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020029 ch = (unsigned char)*s;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010030
31 if (ch < 0x80) {
32 /* Fast path for runs of ASCII characters. Given that common UTF-8
33 input will consist of an overwhelming majority of ASCII
34 characters, we try to optimize for this case by checking
35 as many characters as a C 'long' can contain.
36 First, check if we can do an aligned read, as most CPUs have
37 a penalty for unaligned reads.
38 */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020039 if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010040 /* Help register allocation */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020041 const char *_s = s;
42 STRINGLIB_CHAR *_p = p;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010043 while (_s < aligned_end) {
44 /* Read a whole long at a time (either 4 or 8 bytes),
45 and do a fast unrolled copy if it only contains ASCII
46 characters. */
47 unsigned long value = *(unsigned long *) _s;
48 if (value & ASCII_CHAR_MASK)
49 break;
Christian Heimes743e0cd2012-10-17 23:52:17 +020050#if PY_LITTLE_ENDIAN
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020051 _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
52 _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
53 _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
54 _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
55# if SIZEOF_LONG == 8
56 _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
57 _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
58 _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
59 _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
60# endif
61#else
62# if SIZEOF_LONG == 8
63 _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
64 _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
65 _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
66 _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
67 _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
68 _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
69 _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
70 _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
71# else
72 _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
73 _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
74 _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
75 _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
76# endif
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010077#endif
78 _s += SIZEOF_LONG;
79 _p += SIZEOF_LONG;
80 }
81 s = _s;
82 p = _p;
83 if (s == end)
84 break;
85 ch = (unsigned char)*s;
86 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020087 if (ch < 0x80) {
88 s++;
89 *p++ = ch;
90 continue;
91 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010092 }
93
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020094 if (ch < 0xE0) {
95 /* \xC2\x80-\xDF\xBF -- 0080-07FF */
Victor Stinnerab60de42012-11-04 23:59:15 +010096 Py_UCS4 ch2;
Ezio Melottif7ed5d12012-11-04 23:21:38 +020097 if (ch < 0xC2) {
98 /* invalid sequence
99 \x80-\xBF -- continuation byte
100 \xC0-\xC1 -- fake 0000-007F */
101 goto InvalidStart;
102 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200103 if (end - s < 2) {
104 /* unexpected end of data: the caller will decide whether
105 it's an error or not */
106 break;
107 }
108 ch2 = (unsigned char)s[1];
Mark Dickinson106c4142012-06-23 21:45:14 +0100109 if (!IS_CONTINUATION_BYTE(ch2))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200110 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200111 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200112 ch = (ch << 6) + ch2 -
113 ((0xC0 << 6) + 0x80);
114 assert ((ch > 0x007F) && (ch <= 0x07FF));
115 s += 2;
116 if (STRINGLIB_MAX_CHAR <= 0x007F ||
117 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200118 /* Out-of-range */
119 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100120 *p++ = ch;
121 continue;
122 }
123
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200124 if (ch < 0xF0) {
125 /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
126 Py_UCS4 ch2, ch3;
127 if (end - s < 3) {
128 /* unexpected end of data: the caller will decide whether
129 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200130 if (end - s < 2)
131 break;
132 ch2 = (unsigned char)s[1];
133 if (!IS_CONTINUATION_BYTE(ch2) ||
134 (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
135 /* for clarification see comments below */
136 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200137 break;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100138 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200139 ch2 = (unsigned char)s[1];
140 ch3 = (unsigned char)s[2];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200141 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200142 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200143 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200144 }
145 if (ch == 0xE0) {
146 if (ch2 < 0xA0)
147 /* invalid sequence
148 \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200149 goto InvalidContinuation1;
150 } else if (ch == 0xED && ch2 >= 0xA0) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200151 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
152 will result in surrogates in range D800-DFFF. Surrogates are
153 not valid UTF-8 so they are rejected.
154 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
155 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200156 goto InvalidContinuation1;
157 }
158 if (!IS_CONTINUATION_BYTE(ch3)) {
159 /* invalid continuation byte */
160 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200161 }
162 ch = (ch << 12) + (ch2 << 6) + ch3 -
163 ((0xE0 << 12) + (0x80 << 6) + 0x80);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100164 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
165 s += 3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200166 if (STRINGLIB_MAX_CHAR <= 0x07FF ||
167 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200168 /* Out-of-range */
169 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100170 *p++ = ch;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200171 continue;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100172 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200173
174 if (ch < 0xF5) {
175 /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
176 Py_UCS4 ch2, ch3, ch4;
177 if (end - s < 4) {
178 /* unexpected end of data: the caller will decide whether
179 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200180 if (end - s < 2)
181 break;
182 ch2 = (unsigned char)s[1];
183 if (!IS_CONTINUATION_BYTE(ch2) ||
184 (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
185 /* for clarification see comments below */
186 goto InvalidContinuation1;
187 if (end - s < 3)
188 break;
189 ch3 = (unsigned char)s[2];
190 if (!IS_CONTINUATION_BYTE(ch3))
191 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200192 break;
193 }
194 ch2 = (unsigned char)s[1];
195 ch3 = (unsigned char)s[2];
196 ch4 = (unsigned char)s[3];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200197 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200198 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200199 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200200 }
201 if (ch == 0xF0) {
202 if (ch2 < 0x90)
203 /* invalid sequence
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200204 \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
205 goto InvalidContinuation1;
206 } else if (ch == 0xF4 && ch2 >= 0x90) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200207 /* invalid sequence
208 \xF4\x90\x80\80- -- 110000- overflow */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200209 goto InvalidContinuation1;
210 }
211 if (!IS_CONTINUATION_BYTE(ch3)) {
212 /* invalid continuation byte */
213 goto InvalidContinuation2;
214 }
215 if (!IS_CONTINUATION_BYTE(ch4)) {
216 /* invalid continuation byte */
217 goto InvalidContinuation3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200218 }
219 ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
220 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
221 assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
222 s += 4;
223 if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
224 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200225 /* Out-of-range */
226 goto Return;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200227 *p++ = ch;
228 continue;
229 }
230 goto InvalidStart;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100231 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200232 ch = 0;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200233Return:
234 *inptr = s;
235 *outpos = p - dest;
236 return ch;
237InvalidStart:
238 ch = 1;
239 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200240InvalidContinuation1:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200241 ch = 2;
242 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200243InvalidContinuation2:
244 ch = 3;
245 goto Return;
246InvalidContinuation3:
247 ch = 4;
248 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100249}
250
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100251#undef ASCII_CHAR_MASK
252
Victor Stinner6099a032011-12-18 14:22:26 +0100253
254/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
255 PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
256 UCS-1 strings don't need to handle surrogates for example. */
257Py_LOCAL_INLINE(PyObject *)
258STRINGLIB(utf8_encoder)(PyObject *unicode,
259 STRINGLIB_CHAR *data,
260 Py_ssize_t size,
261 const char *errors)
262{
263#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
264
265 Py_ssize_t i; /* index into s of next input byte */
Victor Stinner6099a032011-12-18 14:22:26 +0100266 char *p; /* next free byte in output buffer */
Victor Stinner6099a032011-12-18 14:22:26 +0100267#if STRINGLIB_SIZEOF_CHAR > 1
Victor Stinner01ada392015-10-01 21:54:51 +0200268 PyObject *error_handler_obj = NULL;
Victor Stinner6099a032011-12-18 14:22:26 +0100269 PyObject *exc = NULL;
270 PyObject *rep = NULL;
Victor Stinner01ada392015-10-01 21:54:51 +0200271 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6099a032011-12-18 14:22:26 +0100272#endif
273#if STRINGLIB_SIZEOF_CHAR == 1
274 const Py_ssize_t max_char_size = 2;
Victor Stinner6099a032011-12-18 14:22:26 +0100275#elif STRINGLIB_SIZEOF_CHAR == 2
276 const Py_ssize_t max_char_size = 3;
Victor Stinner6099a032011-12-18 14:22:26 +0100277#else /* STRINGLIB_SIZEOF_CHAR == 4 */
278 const Py_ssize_t max_char_size = 4;
Victor Stinner6099a032011-12-18 14:22:26 +0100279#endif
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200280 _PyBytesWriter writer;
Victor Stinner6099a032011-12-18 14:22:26 +0100281
282 assert(size >= 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200283 _PyBytesWriter_Init(&writer);
Victor Stinner6099a032011-12-18 14:22:26 +0100284
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200285 if (size > PY_SSIZE_T_MAX / max_char_size) {
286 /* integer overflow */
287 return PyErr_NoMemory();
Victor Stinner6099a032011-12-18 14:22:26 +0100288 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200289
290 p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
291 if (p == NULL)
292 return NULL;
Victor Stinner6099a032011-12-18 14:22:26 +0100293
294 for (i = 0; i < size;) {
295 Py_UCS4 ch = data[i++];
296
297 if (ch < 0x80) {
298 /* Encode ASCII */
299 *p++ = (char) ch;
300
301 }
302 else
303#if STRINGLIB_SIZEOF_CHAR > 1
304 if (ch < 0x0800)
305#endif
306 {
307 /* Encode Latin-1 */
308 *p++ = (char)(0xc0 | (ch >> 6));
309 *p++ = (char)(0x80 | (ch & 0x3f));
310 }
311#if STRINGLIB_SIZEOF_CHAR > 1
312 else if (Py_UNICODE_IS_SURROGATE(ch)) {
Victor Stinner01ada392015-10-01 21:54:51 +0200313 Py_ssize_t startpos, endpos, newpos;
314 Py_ssize_t repsize, k;
315 if (error_handler == _Py_ERROR_UNKNOWN)
316 error_handler = get_error_handler(errors);
317
Victor Stinner6099a032011-12-18 14:22:26 +0100318 startpos = i-1;
Victor Stinner01ada392015-10-01 21:54:51 +0200319 endpos = startpos+1;
Victor Stinner6099a032011-12-18 14:22:26 +0100320
Victor Stinner01ada392015-10-01 21:54:51 +0200321 while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
322 endpos++;
Victor Stinner6099a032011-12-18 14:22:26 +0100323
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200324 /* Only overallocate the buffer if it's not the last write */
325 writer.overallocate = (endpos < size);
326
Victor Stinner01ada392015-10-01 21:54:51 +0200327 switch (error_handler)
328 {
329 case _Py_ERROR_REPLACE:
330 memset(p, '?', endpos - startpos);
331 p += (endpos - startpos);
332 /* fall through the ignore handler */
333 case _Py_ERROR_IGNORE:
334 i += (endpos - startpos - 1);
335 break;
Victor Stinner6099a032011-12-18 14:22:26 +0100336
Victor Stinner01ada392015-10-01 21:54:51 +0200337
338 case _Py_ERROR_SURROGATEPASS:
339 for (k=startpos; k<endpos; k++) {
340 ch = data[k];
341 *p++ = (char)(0xe0 | (ch >> 12));
342 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
343 *p++ = (char)(0x80 | (ch & 0x3f));
344 }
345 i += (endpos - startpos - 1);
346 break;
347
348 case _Py_ERROR_SURROGATEESCAPE:
349 for (k=startpos; k<endpos; k++) {
350 ch = data[k];
351 if (!(0xDC80 <= ch && ch <= 0xDCFF))
352 break;
353 *p++ = (char)(ch & 0xff);
354 }
355 if (k >= endpos) {
356 i += (endpos - startpos - 1);
357 break;
358 }
359 startpos = k;
360 assert(startpos < endpos);
361 /* fall through the default handler */
362
363 default:
364 rep = unicode_encode_call_errorhandler(
365 errors, &error_handler_obj, "utf-8", "surrogates not allowed",
366 unicode, &exc, startpos, endpos, &newpos);
367 if (!rep)
368 goto error;
369
370 if (PyBytes_Check(rep))
371 repsize = PyBytes_GET_SIZE(rep);
Victor Stinner6099a032011-12-18 14:22:26 +0100372 else
Victor Stinner01ada392015-10-01 21:54:51 +0200373 repsize = PyUnicode_GET_LENGTH(rep);
Victor Stinner6099a032011-12-18 14:22:26 +0100374
Victor Stinner01ada392015-10-01 21:54:51 +0200375 if (repsize > max_char_size) {
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200376 p = _PyBytesWriter_Prepare(&writer, p,
377 repsize - max_char_size);
378 if (p == NULL)
Victor Stinner6099a032011-12-18 14:22:26 +0100379 goto error;
Victor Stinner6099a032011-12-18 14:22:26 +0100380 }
Victor Stinner6099a032011-12-18 14:22:26 +0100381
Victor Stinner01ada392015-10-01 21:54:51 +0200382 if (PyBytes_Check(rep)) {
383 memcpy(p, PyBytes_AS_STRING(rep), repsize);
384 p += repsize;
385 }
386 else {
387 /* rep is unicode */
388 if (PyUnicode_READY(rep) < 0)
389 goto error;
Victor Stinner6099a032011-12-18 14:22:26 +0100390
Victor Stinner01ada392015-10-01 21:54:51 +0200391 if (!PyUnicode_IS_ASCII(rep)) {
Victor Stinner6099a032011-12-18 14:22:26 +0100392 raise_encode_exception(&exc, "utf-8",
393 unicode,
394 i-1, i,
395 "surrogates not allowed");
396 goto error;
397 }
Victor Stinner01ada392015-10-01 21:54:51 +0200398
399 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
400 memcpy(p, PyUnicode_DATA(rep), repsize);
401 p += repsize;
Victor Stinner6099a032011-12-18 14:22:26 +0100402 }
Victor Stinner01ada392015-10-01 21:54:51 +0200403 Py_CLEAR(rep);
404
405 i = newpos;
Victor Stinner6099a032011-12-18 14:22:26 +0100406 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200407
408 /* If overallocation was disabled, ensure that it was the last
409 write. Otherwise, we missed an optimization */
410 assert(writer.overallocate || i == size);
Victor Stinner6099a032011-12-18 14:22:26 +0100411 }
412 else
413#if STRINGLIB_SIZEOF_CHAR > 2
414 if (ch < 0x10000)
415#endif
416 {
417 *p++ = (char)(0xe0 | (ch >> 12));
418 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
419 *p++ = (char)(0x80 | (ch & 0x3f));
420 }
421#if STRINGLIB_SIZEOF_CHAR > 2
422 else /* ch >= 0x10000 */
423 {
424 assert(ch <= MAX_UNICODE);
425 /* Encode UCS4 Unicode ordinals */
426 *p++ = (char)(0xf0 | (ch >> 18));
427 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
428 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
429 *p++ = (char)(0x80 | (ch & 0x3f));
430 }
431#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
432#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
433 }
434
Victor Stinner6099a032011-12-18 14:22:26 +0100435#if STRINGLIB_SIZEOF_CHAR > 1
Victor Stinner01ada392015-10-01 21:54:51 +0200436 Py_XDECREF(error_handler_obj);
Victor Stinner6099a032011-12-18 14:22:26 +0100437 Py_XDECREF(exc);
438#endif
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200439 return _PyBytesWriter_Finish(&writer, p);
Victor Stinner6099a032011-12-18 14:22:26 +0100440
441#if STRINGLIB_SIZEOF_CHAR > 1
442 error:
443 Py_XDECREF(rep);
Victor Stinner01ada392015-10-01 21:54:51 +0200444 Py_XDECREF(error_handler_obj);
Victor Stinner6099a032011-12-18 14:22:26 +0100445 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200446 _PyBytesWriter_Dealloc(&writer);
Victor Stinner6099a032011-12-18 14:22:26 +0100447 return NULL;
448#endif
449
450#undef MAX_SHORT_UNICHARS
451}
452
Antoine Pitrou63065d72012-05-15 23:48:04 +0200453/* The pattern for constructing UCS2-repeated masks. */
454#if SIZEOF_LONG == 8
455# define UCS2_REPEAT_MASK 0x0001000100010001ul
456#elif SIZEOF_LONG == 4
457# define UCS2_REPEAT_MASK 0x00010001ul
458#else
459# error C 'long' size should be either 4 or 8!
460#endif
461
462/* The mask for fast checking. */
463#if STRINGLIB_SIZEOF_CHAR == 1
464/* The mask for fast checking of whether a C 'long' contains a
465 non-ASCII or non-Latin1 UTF16-encoded characters. */
466# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
467#else
468/* The mask for fast checking of whether a C 'long' may contain
469 UTF16-encoded surrogate characters. This is an efficient heuristic,
470 assuming that non-surrogate characters with a code point >= 0x8000 are
471 rare in most input.
472*/
473# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
474#endif
475/* The mask for fast byte-swapping. */
476#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
477/* Swap bytes. */
478#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
479 (((value) & STRIPPED_MASK) << 8))
480
481Py_LOCAL_INLINE(Py_UCS4)
482STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
483 STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
484 int native_ordering)
485{
486 Py_UCS4 ch;
487 const unsigned char *aligned_end =
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200488 (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
Antoine Pitrou63065d72012-05-15 23:48:04 +0200489 const unsigned char *q = *inptr;
490 STRINGLIB_CHAR *p = dest + *outpos;
491 /* Offsets from q for retrieving byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +0200492#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +0200493 int ihi = !!native_ordering, ilo = !native_ordering;
494#else
495 int ihi = !native_ordering, ilo = !!native_ordering;
496#endif
497 --e;
498
499 while (q < e) {
500 Py_UCS4 ch2;
501 /* First check for possible aligned read of a C 'long'. Unaligned
502 reads are more expensive, better to defer to another iteration. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200503 if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
Antoine Pitrou63065d72012-05-15 23:48:04 +0200504 /* Fast path for runs of in-range non-surrogate chars. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200505 const unsigned char *_q = q;
Antoine Pitrou63065d72012-05-15 23:48:04 +0200506 while (_q < aligned_end) {
507 unsigned long block = * (unsigned long *) _q;
508 if (native_ordering) {
509 /* Can use buffer directly */
510 if (block & FAST_CHAR_MASK)
511 break;
512 }
513 else {
514 /* Need to byte-swap */
515 if (block & SWAB(FAST_CHAR_MASK))
516 break;
517#if STRINGLIB_SIZEOF_CHAR == 1
518 block >>= 8;
519#else
520 block = SWAB(block);
521#endif
522 }
Christian Heimes743e0cd2012-10-17 23:52:17 +0200523#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +0200524# if SIZEOF_LONG == 4
525 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
526 p[1] = (STRINGLIB_CHAR)(block >> 16);
527# elif SIZEOF_LONG == 8
528 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
529 p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
530 p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
531 p[3] = (STRINGLIB_CHAR)(block >> 48);
532# endif
533#else
534# if SIZEOF_LONG == 4
535 p[0] = (STRINGLIB_CHAR)(block >> 16);
536 p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
537# elif SIZEOF_LONG == 8
538 p[0] = (STRINGLIB_CHAR)(block >> 48);
539 p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
540 p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
541 p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
542# endif
543#endif
544 _q += SIZEOF_LONG;
545 p += SIZEOF_LONG / 2;
546 }
547 q = _q;
548 if (q >= e)
549 break;
550 }
551
552 ch = (q[ihi] << 8) | q[ilo];
553 q += 2;
554 if (!Py_UNICODE_IS_SURROGATE(ch)) {
555#if STRINGLIB_SIZEOF_CHAR < 2
556 if (ch > STRINGLIB_MAX_CHAR)
557 /* Out-of-range */
558 goto Return;
559#endif
560 *p++ = (STRINGLIB_CHAR)ch;
561 continue;
562 }
563
564 /* UTF-16 code pair: */
565 if (q >= e)
566 goto UnexpectedEnd;
567 if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
568 goto IllegalEncoding;
569 ch2 = (q[ihi] << 8) | q[ilo];
570 q += 2;
571 if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
572 goto IllegalSurrogate;
573 ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
574#if STRINGLIB_SIZEOF_CHAR < 4
575 /* Out-of-range */
576 goto Return;
577#else
578 *p++ = (STRINGLIB_CHAR)ch;
579#endif
580 }
581 ch = 0;
582Return:
583 *inptr = q;
584 *outpos = p - dest;
585 return ch;
586UnexpectedEnd:
587 ch = 1;
588 goto Return;
589IllegalEncoding:
590 ch = 2;
591 goto Return;
592IllegalSurrogate:
593 ch = 3;
594 goto Return;
595}
596#undef UCS2_REPEAT_MASK
597#undef FAST_CHAR_MASK
598#undef STRIPPED_MASK
599#undef SWAB
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200600
601
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200602#if STRINGLIB_MAX_CHAR >= 0x80
603Py_LOCAL_INLINE(Py_ssize_t)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200604STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
605 Py_ssize_t len,
606 unsigned short **outptr,
607 int native_ordering)
608{
609 unsigned short *out = *outptr;
610 const STRINGLIB_CHAR *end = in + len;
611#if STRINGLIB_SIZEOF_CHAR == 1
612 if (native_ordering) {
613 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
614 while (in < unrolled_end) {
615 out[0] = in[0];
616 out[1] = in[1];
617 out[2] = in[2];
618 out[3] = in[3];
619 in += 4; out += 4;
620 }
621 while (in < end) {
622 *out++ = *in++;
623 }
624 } else {
625# define SWAB2(CH) ((CH) << 8) /* high byte is zero */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200626 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200627 while (in < unrolled_end) {
628 out[0] = SWAB2(in[0]);
629 out[1] = SWAB2(in[1]);
630 out[2] = SWAB2(in[2]);
631 out[3] = SWAB2(in[3]);
632 in += 4; out += 4;
633 }
634 while (in < end) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200635 Py_UCS4 ch = *in++;
636 *out++ = SWAB2((Py_UCS2)ch);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200637 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200638#undef SWAB2
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200639 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200640 *outptr = out;
641 return len;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200642#else
643 if (native_ordering) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200644#if STRINGLIB_MAX_CHAR < 0x10000
645 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
646 while (in < unrolled_end) {
647 /* check if any character is a surrogate character */
648 if (((in[0] ^ 0xd800) &
649 (in[1] ^ 0xd800) &
650 (in[2] ^ 0xd800) &
651 (in[3] ^ 0xd800) & 0xf800) == 0)
652 break;
653 out[0] = in[0];
654 out[1] = in[1];
655 out[2] = in[2];
656 out[3] = in[3];
657 in += 4; out += 4;
658 }
659#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200660 while (in < end) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200661 Py_UCS4 ch;
662 ch = *in++;
663 if (ch < 0xd800)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200664 *out++ = ch;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200665 else if (ch < 0xe000)
Serhiy Storchaka7e29eea2015-05-18 22:19:42 +0300666 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200667 goto fail;
668#if STRINGLIB_MAX_CHAR >= 0x10000
669 else if (ch >= 0x10000) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200670 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
671 out[1] = Py_UNICODE_LOW_SURROGATE(ch);
672 out += 2;
673 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200674#endif
675 else
676 *out++ = ch;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200677 }
678 } else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200679#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
680#if STRINGLIB_MAX_CHAR < 0x10000
681 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
682 while (in < unrolled_end) {
683 /* check if any character is a surrogate character */
684 if (((in[0] ^ 0xd800) &
685 (in[1] ^ 0xd800) &
686 (in[2] ^ 0xd800) &
687 (in[3] ^ 0xd800) & 0xf800) == 0)
688 break;
689 out[0] = SWAB2(in[0]);
690 out[1] = SWAB2(in[1]);
691 out[2] = SWAB2(in[2]);
692 out[3] = SWAB2(in[3]);
693 in += 4; out += 4;
694 }
695#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200696 while (in < end) {
697 Py_UCS4 ch = *in++;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200698 if (ch < 0xd800)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200699 *out++ = SWAB2((Py_UCS2)ch);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200700 else if (ch < 0xe000)
Serhiy Storchaka7e29eea2015-05-18 22:19:42 +0300701 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200702 goto fail;
703#if STRINGLIB_MAX_CHAR >= 0x10000
704 else if (ch >= 0x10000) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200705 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
706 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
707 out[0] = SWAB2(ch1);
708 out[1] = SWAB2(ch2);
709 out += 2;
710 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200711#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200712 else
713 *out++ = SWAB2((Py_UCS2)ch);
714 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200715#undef SWAB2
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200716 }
717 *outptr = out;
718 return len;
719 fail:
720 *outptr = out;
721 return len - (end - in + 1);
722#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200723}
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300724
725#if STRINGLIB_SIZEOF_CHAR == 1
726# define SWAB4(CH, tmp) ((CH) << 24) /* high bytes are zero */
727#elif STRINGLIB_SIZEOF_CHAR == 2
728# define SWAB4(CH, tmp) (tmp = (CH), \
729 ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
730 /* high bytes are zero */
731#else
732# define SWAB4(CH, tmp) (tmp = (CH), \
733 tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
734 ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
735#endif
736Py_LOCAL_INLINE(Py_ssize_t)
737STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
738 Py_ssize_t len,
739 PY_UINT32_T **outptr,
740 int native_ordering)
741{
742 PY_UINT32_T *out = *outptr;
743 const STRINGLIB_CHAR *end = in + len;
744 if (native_ordering) {
745 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
746 while (in < unrolled_end) {
747#if STRINGLIB_SIZEOF_CHAR > 1
748 /* check if any character is a surrogate character */
749 if (((in[0] ^ 0xd800) &
750 (in[1] ^ 0xd800) &
751 (in[2] ^ 0xd800) &
752 (in[3] ^ 0xd800) & 0xf800) == 0)
753 break;
754#endif
755 out[0] = in[0];
756 out[1] = in[1];
757 out[2] = in[2];
758 out[3] = in[3];
759 in += 4; out += 4;
760 }
761 while (in < end) {
762 Py_UCS4 ch;
763 ch = *in++;
764#if STRINGLIB_SIZEOF_CHAR > 1
765 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchaka9ce71a62015-05-18 22:20:18 +0300766 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300767 goto fail;
768 }
769#endif
770 *out++ = ch;
771 }
772 } else {
773 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
774 while (in < unrolled_end) {
775#if STRINGLIB_SIZEOF_CHAR > 1
776 Py_UCS4 ch1, ch2, ch3, ch4;
777 /* check if any character is a surrogate character */
778 if (((in[0] ^ 0xd800) &
779 (in[1] ^ 0xd800) &
780 (in[2] ^ 0xd800) &
781 (in[3] ^ 0xd800) & 0xf800) == 0)
782 break;
783#endif
784 out[0] = SWAB4(in[0], ch1);
785 out[1] = SWAB4(in[1], ch2);
786 out[2] = SWAB4(in[2], ch3);
787 out[3] = SWAB4(in[3], ch4);
788 in += 4; out += 4;
789 }
790 while (in < end) {
791 Py_UCS4 ch = *in++;
792#if STRINGLIB_SIZEOF_CHAR > 1
793 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchaka9ce71a62015-05-18 22:20:18 +0300794 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300795 goto fail;
796 }
797#endif
798 *out++ = SWAB4(ch, ch);
799 }
800 }
801 *outptr = out;
802 return len;
803#if STRINGLIB_SIZEOF_CHAR > 1
804 fail:
805 *outptr = out;
806 return len - (end - in + 1);
807#endif
808}
809#undef SWAB4
810
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200811#endif
812
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100813#endif /* STRINGLIB_IS_UNICODE */