/* stringlib: codec implementations */
2
3#if STRINGLIB_IS_UNICODE
4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005/* Mask to quickly check whether a C 'long' contains a
6 non-ASCII, UTF8-encoded char. */
7#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02008# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01009#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +020010# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010011#else
12# error C 'long' size should be either 4 or 8!
13#endif
14
Mark Dickinson106c4142012-06-23 21:45:14 +010015/* 10xxxxxx */
16#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
17
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020018Py_LOCAL_INLINE(Py_UCS4)
19STRINGLIB(utf8_decode)(const char **inptr, const char *end,
20 STRINGLIB_CHAR *dest,
21 Py_ssize_t *outpos)
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010022{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020023 Py_UCS4 ch;
24 const char *s = *inptr;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020025 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020026 STRINGLIB_CHAR *p = dest + *outpos;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010027
28 while (s < end) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020029 ch = (unsigned char)*s;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010030
31 if (ch < 0x80) {
32 /* Fast path for runs of ASCII characters. Given that common UTF-8
33 input will consist of an overwhelming majority of ASCII
34 characters, we try to optimize for this case by checking
35 as many characters as a C 'long' can contain.
36 First, check if we can do an aligned read, as most CPUs have
37 a penalty for unaligned reads.
38 */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020039 if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010040 /* Help register allocation */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020041 const char *_s = s;
42 STRINGLIB_CHAR *_p = p;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010043 while (_s < aligned_end) {
44 /* Read a whole long at a time (either 4 or 8 bytes),
45 and do a fast unrolled copy if it only contains ASCII
46 characters. */
47 unsigned long value = *(unsigned long *) _s;
48 if (value & ASCII_CHAR_MASK)
49 break;
Christian Heimes743e0cd2012-10-17 23:52:17 +020050#if PY_LITTLE_ENDIAN
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020051 _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
52 _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
53 _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
54 _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
55# if SIZEOF_LONG == 8
56 _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
57 _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
58 _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
59 _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
60# endif
61#else
62# if SIZEOF_LONG == 8
63 _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
64 _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
65 _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
66 _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
67 _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
68 _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
69 _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
70 _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
71# else
72 _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
73 _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
74 _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
75 _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
76# endif
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010077#endif
78 _s += SIZEOF_LONG;
79 _p += SIZEOF_LONG;
80 }
81 s = _s;
82 p = _p;
83 if (s == end)
84 break;
85 ch = (unsigned char)*s;
86 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020087 if (ch < 0x80) {
88 s++;
89 *p++ = ch;
90 continue;
91 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010092 }
93
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020094 if (ch < 0xE0) {
95 /* \xC2\x80-\xDF\xBF -- 0080-07FF */
Victor Stinnerab60de42012-11-04 23:59:15 +010096 Py_UCS4 ch2;
Ezio Melottif7ed5d12012-11-04 23:21:38 +020097 if (ch < 0xC2) {
98 /* invalid sequence
99 \x80-\xBF -- continuation byte
100 \xC0-\xC1 -- fake 0000-007F */
101 goto InvalidStart;
102 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200103 if (end - s < 2) {
104 /* unexpected end of data: the caller will decide whether
105 it's an error or not */
106 break;
107 }
108 ch2 = (unsigned char)s[1];
Mark Dickinson106c4142012-06-23 21:45:14 +0100109 if (!IS_CONTINUATION_BYTE(ch2))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200110 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200111 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200112 ch = (ch << 6) + ch2 -
113 ((0xC0 << 6) + 0x80);
114 assert ((ch > 0x007F) && (ch <= 0x07FF));
115 s += 2;
116 if (STRINGLIB_MAX_CHAR <= 0x007F ||
117 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200118 /* Out-of-range */
119 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100120 *p++ = ch;
121 continue;
122 }
123
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200124 if (ch < 0xF0) {
125 /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
126 Py_UCS4 ch2, ch3;
127 if (end - s < 3) {
128 /* unexpected end of data: the caller will decide whether
129 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200130 if (end - s < 2)
131 break;
132 ch2 = (unsigned char)s[1];
133 if (!IS_CONTINUATION_BYTE(ch2) ||
134 (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
135 /* for clarification see comments below */
136 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200137 break;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100138 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200139 ch2 = (unsigned char)s[1];
140 ch3 = (unsigned char)s[2];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200141 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200142 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200143 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200144 }
145 if (ch == 0xE0) {
146 if (ch2 < 0xA0)
147 /* invalid sequence
148 \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200149 goto InvalidContinuation1;
150 } else if (ch == 0xED && ch2 >= 0xA0) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200151 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
152 will result in surrogates in range D800-DFFF. Surrogates are
153 not valid UTF-8 so they are rejected.
154 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
155 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200156 goto InvalidContinuation1;
157 }
158 if (!IS_CONTINUATION_BYTE(ch3)) {
159 /* invalid continuation byte */
160 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200161 }
162 ch = (ch << 12) + (ch2 << 6) + ch3 -
163 ((0xE0 << 12) + (0x80 << 6) + 0x80);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100164 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
165 s += 3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200166 if (STRINGLIB_MAX_CHAR <= 0x07FF ||
167 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200168 /* Out-of-range */
169 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100170 *p++ = ch;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200171 continue;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100172 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200173
174 if (ch < 0xF5) {
175 /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
176 Py_UCS4 ch2, ch3, ch4;
177 if (end - s < 4) {
178 /* unexpected end of data: the caller will decide whether
179 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200180 if (end - s < 2)
181 break;
182 ch2 = (unsigned char)s[1];
183 if (!IS_CONTINUATION_BYTE(ch2) ||
184 (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
185 /* for clarification see comments below */
186 goto InvalidContinuation1;
187 if (end - s < 3)
188 break;
189 ch3 = (unsigned char)s[2];
190 if (!IS_CONTINUATION_BYTE(ch3))
191 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200192 break;
193 }
194 ch2 = (unsigned char)s[1];
195 ch3 = (unsigned char)s[2];
196 ch4 = (unsigned char)s[3];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200197 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200198 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200199 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200200 }
201 if (ch == 0xF0) {
202 if (ch2 < 0x90)
203 /* invalid sequence
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200204 \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
205 goto InvalidContinuation1;
206 } else if (ch == 0xF4 && ch2 >= 0x90) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200207 /* invalid sequence
208 \xF4\x90\x80\80- -- 110000- overflow */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200209 goto InvalidContinuation1;
210 }
211 if (!IS_CONTINUATION_BYTE(ch3)) {
212 /* invalid continuation byte */
213 goto InvalidContinuation2;
214 }
215 if (!IS_CONTINUATION_BYTE(ch4)) {
216 /* invalid continuation byte */
217 goto InvalidContinuation3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200218 }
219 ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
220 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
221 assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
222 s += 4;
223 if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
224 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200225 /* Out-of-range */
226 goto Return;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200227 *p++ = ch;
228 continue;
229 }
230 goto InvalidStart;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100231 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200232 ch = 0;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200233Return:
234 *inptr = s;
235 *outpos = p - dest;
236 return ch;
237InvalidStart:
238 ch = 1;
239 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200240InvalidContinuation1:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200241 ch = 2;
242 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200243InvalidContinuation2:
244 ch = 3;
245 goto Return;
246InvalidContinuation3:
247 ch = 4;
248 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100249}
250
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100251#undef ASCII_CHAR_MASK
252
Victor Stinner6099a032011-12-18 14:22:26 +0100253
254/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
255 PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
256 UCS-1 strings don't need to handle surrogates for example. */
257Py_LOCAL_INLINE(PyObject *)
258STRINGLIB(utf8_encoder)(PyObject *unicode,
259 STRINGLIB_CHAR *data,
260 Py_ssize_t size,
261 const char *errors)
262{
263#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
264
265 Py_ssize_t i; /* index into s of next input byte */
Victor Stinner6099a032011-12-18 14:22:26 +0100266 char *p; /* next free byte in output buffer */
Victor Stinner6099a032011-12-18 14:22:26 +0100267#if STRINGLIB_SIZEOF_CHAR > 1
Victor Stinner01ada392015-10-01 21:54:51 +0200268 PyObject *error_handler_obj = NULL;
Victor Stinner6099a032011-12-18 14:22:26 +0100269 PyObject *exc = NULL;
270 PyObject *rep = NULL;
Victor Stinner01ada392015-10-01 21:54:51 +0200271 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6099a032011-12-18 14:22:26 +0100272#endif
273#if STRINGLIB_SIZEOF_CHAR == 1
274 const Py_ssize_t max_char_size = 2;
Victor Stinner6099a032011-12-18 14:22:26 +0100275#elif STRINGLIB_SIZEOF_CHAR == 2
276 const Py_ssize_t max_char_size = 3;
Victor Stinner6099a032011-12-18 14:22:26 +0100277#else /* STRINGLIB_SIZEOF_CHAR == 4 */
278 const Py_ssize_t max_char_size = 4;
Victor Stinner6099a032011-12-18 14:22:26 +0100279#endif
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200280 _PyBytesWriter writer;
Victor Stinner6099a032011-12-18 14:22:26 +0100281
282 assert(size >= 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200283 _PyBytesWriter_Init(&writer);
Victor Stinner6099a032011-12-18 14:22:26 +0100284
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200285 if (size > PY_SSIZE_T_MAX / max_char_size) {
286 /* integer overflow */
287 return PyErr_NoMemory();
Victor Stinner6099a032011-12-18 14:22:26 +0100288 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200289
290 p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
291 if (p == NULL)
292 return NULL;
Victor Stinner6099a032011-12-18 14:22:26 +0100293
294 for (i = 0; i < size;) {
295 Py_UCS4 ch = data[i++];
296
297 if (ch < 0x80) {
298 /* Encode ASCII */
299 *p++ = (char) ch;
300
301 }
302 else
303#if STRINGLIB_SIZEOF_CHAR > 1
304 if (ch < 0x0800)
305#endif
306 {
307 /* Encode Latin-1 */
308 *p++ = (char)(0xc0 | (ch >> 6));
309 *p++ = (char)(0x80 | (ch & 0x3f));
310 }
311#if STRINGLIB_SIZEOF_CHAR > 1
312 else if (Py_UNICODE_IS_SURROGATE(ch)) {
Victor Stinner01ada392015-10-01 21:54:51 +0200313 Py_ssize_t startpos, endpos, newpos;
314 Py_ssize_t repsize, k;
315 if (error_handler == _Py_ERROR_UNKNOWN)
316 error_handler = get_error_handler(errors);
317
Victor Stinner6099a032011-12-18 14:22:26 +0100318 startpos = i-1;
Victor Stinner01ada392015-10-01 21:54:51 +0200319 endpos = startpos+1;
Victor Stinner6099a032011-12-18 14:22:26 +0100320
Victor Stinner01ada392015-10-01 21:54:51 +0200321 while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
322 endpos++;
Victor Stinner6099a032011-12-18 14:22:26 +0100323
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200324 /* Only overallocate the buffer if it's not the last write */
325 writer.overallocate = (endpos < size);
326
Victor Stinner01ada392015-10-01 21:54:51 +0200327 switch (error_handler)
328 {
329 case _Py_ERROR_REPLACE:
330 memset(p, '?', endpos - startpos);
331 p += (endpos - startpos);
332 /* fall through the ignore handler */
333 case _Py_ERROR_IGNORE:
334 i += (endpos - startpos - 1);
335 break;
Victor Stinner6099a032011-12-18 14:22:26 +0100336
Victor Stinner01ada392015-10-01 21:54:51 +0200337 case _Py_ERROR_SURROGATEPASS:
338 for (k=startpos; k<endpos; k++) {
339 ch = data[k];
340 *p++ = (char)(0xe0 | (ch >> 12));
341 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
342 *p++ = (char)(0x80 | (ch & 0x3f));
343 }
344 i += (endpos - startpos - 1);
345 break;
346
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200347 case _Py_ERROR_BACKSLASHREPLACE:
348 p = backslashreplace(&writer, max_char_size, p,
349 unicode, startpos, endpos);
350 if (p == NULL)
351 goto error;
352 i += (endpos - startpos - 1);
353 break;
354
355 case _Py_ERROR_XMLCHARREFREPLACE:
356 p = xmlcharrefreplace(&writer, max_char_size, p,
357 unicode, startpos, endpos);
358 if (p == NULL)
359 goto error;
360 i += (endpos - startpos - 1);
361 break;
362
Victor Stinner01ada392015-10-01 21:54:51 +0200363 case _Py_ERROR_SURROGATEESCAPE:
364 for (k=startpos; k<endpos; k++) {
365 ch = data[k];
366 if (!(0xDC80 <= ch && ch <= 0xDCFF))
367 break;
368 *p++ = (char)(ch & 0xff);
369 }
370 if (k >= endpos) {
371 i += (endpos - startpos - 1);
372 break;
373 }
374 startpos = k;
375 assert(startpos < endpos);
376 /* fall through the default handler */
Victor Stinner01ada392015-10-01 21:54:51 +0200377 default:
378 rep = unicode_encode_call_errorhandler(
379 errors, &error_handler_obj, "utf-8", "surrogates not allowed",
380 unicode, &exc, startpos, endpos, &newpos);
381 if (!rep)
382 goto error;
383
384 if (PyBytes_Check(rep))
385 repsize = PyBytes_GET_SIZE(rep);
Victor Stinner6099a032011-12-18 14:22:26 +0100386 else
Victor Stinner01ada392015-10-01 21:54:51 +0200387 repsize = PyUnicode_GET_LENGTH(rep);
Victor Stinner6099a032011-12-18 14:22:26 +0100388
Victor Stinner01ada392015-10-01 21:54:51 +0200389 if (repsize > max_char_size) {
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200390 p = _PyBytesWriter_Prepare(&writer, p,
391 repsize - max_char_size);
392 if (p == NULL)
Victor Stinner6099a032011-12-18 14:22:26 +0100393 goto error;
Victor Stinner6099a032011-12-18 14:22:26 +0100394 }
Victor Stinner6099a032011-12-18 14:22:26 +0100395
Victor Stinner01ada392015-10-01 21:54:51 +0200396 if (PyBytes_Check(rep)) {
397 memcpy(p, PyBytes_AS_STRING(rep), repsize);
398 p += repsize;
399 }
400 else {
401 /* rep is unicode */
402 if (PyUnicode_READY(rep) < 0)
403 goto error;
Victor Stinner6099a032011-12-18 14:22:26 +0100404
Victor Stinner01ada392015-10-01 21:54:51 +0200405 if (!PyUnicode_IS_ASCII(rep)) {
Victor Stinner6099a032011-12-18 14:22:26 +0100406 raise_encode_exception(&exc, "utf-8",
407 unicode,
408 i-1, i,
409 "surrogates not allowed");
410 goto error;
411 }
Victor Stinner01ada392015-10-01 21:54:51 +0200412
413 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
414 memcpy(p, PyUnicode_DATA(rep), repsize);
415 p += repsize;
Victor Stinner6099a032011-12-18 14:22:26 +0100416 }
Victor Stinner01ada392015-10-01 21:54:51 +0200417 Py_CLEAR(rep);
418
419 i = newpos;
Victor Stinner6099a032011-12-18 14:22:26 +0100420 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200421
422 /* If overallocation was disabled, ensure that it was the last
423 write. Otherwise, we missed an optimization */
424 assert(writer.overallocate || i == size);
Victor Stinner6099a032011-12-18 14:22:26 +0100425 }
426 else
427#if STRINGLIB_SIZEOF_CHAR > 2
428 if (ch < 0x10000)
429#endif
430 {
431 *p++ = (char)(0xe0 | (ch >> 12));
432 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
433 *p++ = (char)(0x80 | (ch & 0x3f));
434 }
435#if STRINGLIB_SIZEOF_CHAR > 2
436 else /* ch >= 0x10000 */
437 {
438 assert(ch <= MAX_UNICODE);
439 /* Encode UCS4 Unicode ordinals */
440 *p++ = (char)(0xf0 | (ch >> 18));
441 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
442 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
443 *p++ = (char)(0x80 | (ch & 0x3f));
444 }
445#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
446#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
447 }
448
Victor Stinner6099a032011-12-18 14:22:26 +0100449#if STRINGLIB_SIZEOF_CHAR > 1
Victor Stinner01ada392015-10-01 21:54:51 +0200450 Py_XDECREF(error_handler_obj);
Victor Stinner6099a032011-12-18 14:22:26 +0100451 Py_XDECREF(exc);
452#endif
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200453 return _PyBytesWriter_Finish(&writer, p);
Victor Stinner6099a032011-12-18 14:22:26 +0100454
455#if STRINGLIB_SIZEOF_CHAR > 1
456 error:
457 Py_XDECREF(rep);
Victor Stinner01ada392015-10-01 21:54:51 +0200458 Py_XDECREF(error_handler_obj);
Victor Stinner6099a032011-12-18 14:22:26 +0100459 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200460 _PyBytesWriter_Dealloc(&writer);
Victor Stinner6099a032011-12-18 14:22:26 +0100461 return NULL;
462#endif
463
464#undef MAX_SHORT_UNICHARS
465}
466
Antoine Pitrou63065d72012-05-15 23:48:04 +0200467/* The pattern for constructing UCS2-repeated masks. */
468#if SIZEOF_LONG == 8
469# define UCS2_REPEAT_MASK 0x0001000100010001ul
470#elif SIZEOF_LONG == 4
471# define UCS2_REPEAT_MASK 0x00010001ul
472#else
473# error C 'long' size should be either 4 or 8!
474#endif
475
476/* The mask for fast checking. */
477#if STRINGLIB_SIZEOF_CHAR == 1
478/* The mask for fast checking of whether a C 'long' contains a
479 non-ASCII or non-Latin1 UTF16-encoded characters. */
480# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
481#else
482/* The mask for fast checking of whether a C 'long' may contain
483 UTF16-encoded surrogate characters. This is an efficient heuristic,
484 assuming that non-surrogate characters with a code point >= 0x8000 are
485 rare in most input.
486*/
487# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
488#endif
489/* The mask for fast byte-swapping. */
490#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
491/* Swap bytes. */
492#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
493 (((value) & STRIPPED_MASK) << 8))
494
495Py_LOCAL_INLINE(Py_UCS4)
496STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
497 STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
498 int native_ordering)
499{
500 Py_UCS4 ch;
501 const unsigned char *aligned_end =
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200502 (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
Antoine Pitrou63065d72012-05-15 23:48:04 +0200503 const unsigned char *q = *inptr;
504 STRINGLIB_CHAR *p = dest + *outpos;
505 /* Offsets from q for retrieving byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +0200506#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +0200507 int ihi = !!native_ordering, ilo = !native_ordering;
508#else
509 int ihi = !native_ordering, ilo = !!native_ordering;
510#endif
511 --e;
512
513 while (q < e) {
514 Py_UCS4 ch2;
515 /* First check for possible aligned read of a C 'long'. Unaligned
516 reads are more expensive, better to defer to another iteration. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200517 if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
Antoine Pitrou63065d72012-05-15 23:48:04 +0200518 /* Fast path for runs of in-range non-surrogate chars. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200519 const unsigned char *_q = q;
Antoine Pitrou63065d72012-05-15 23:48:04 +0200520 while (_q < aligned_end) {
521 unsigned long block = * (unsigned long *) _q;
522 if (native_ordering) {
523 /* Can use buffer directly */
524 if (block & FAST_CHAR_MASK)
525 break;
526 }
527 else {
528 /* Need to byte-swap */
529 if (block & SWAB(FAST_CHAR_MASK))
530 break;
531#if STRINGLIB_SIZEOF_CHAR == 1
532 block >>= 8;
533#else
534 block = SWAB(block);
535#endif
536 }
Christian Heimes743e0cd2012-10-17 23:52:17 +0200537#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +0200538# if SIZEOF_LONG == 4
539 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
540 p[1] = (STRINGLIB_CHAR)(block >> 16);
541# elif SIZEOF_LONG == 8
542 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
543 p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
544 p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
545 p[3] = (STRINGLIB_CHAR)(block >> 48);
546# endif
547#else
548# if SIZEOF_LONG == 4
549 p[0] = (STRINGLIB_CHAR)(block >> 16);
550 p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
551# elif SIZEOF_LONG == 8
552 p[0] = (STRINGLIB_CHAR)(block >> 48);
553 p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
554 p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
555 p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
556# endif
557#endif
558 _q += SIZEOF_LONG;
559 p += SIZEOF_LONG / 2;
560 }
561 q = _q;
562 if (q >= e)
563 break;
564 }
565
566 ch = (q[ihi] << 8) | q[ilo];
567 q += 2;
568 if (!Py_UNICODE_IS_SURROGATE(ch)) {
569#if STRINGLIB_SIZEOF_CHAR < 2
570 if (ch > STRINGLIB_MAX_CHAR)
571 /* Out-of-range */
572 goto Return;
573#endif
574 *p++ = (STRINGLIB_CHAR)ch;
575 continue;
576 }
577
578 /* UTF-16 code pair: */
579 if (q >= e)
580 goto UnexpectedEnd;
581 if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
582 goto IllegalEncoding;
583 ch2 = (q[ihi] << 8) | q[ilo];
584 q += 2;
585 if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
586 goto IllegalSurrogate;
587 ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
588#if STRINGLIB_SIZEOF_CHAR < 4
589 /* Out-of-range */
590 goto Return;
591#else
592 *p++ = (STRINGLIB_CHAR)ch;
593#endif
594 }
595 ch = 0;
596Return:
597 *inptr = q;
598 *outpos = p - dest;
599 return ch;
600UnexpectedEnd:
601 ch = 1;
602 goto Return;
603IllegalEncoding:
604 ch = 2;
605 goto Return;
606IllegalSurrogate:
607 ch = 3;
608 goto Return;
609}
610#undef UCS2_REPEAT_MASK
611#undef FAST_CHAR_MASK
612#undef STRIPPED_MASK
613#undef SWAB
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200614
615
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200616#if STRINGLIB_MAX_CHAR >= 0x80
617Py_LOCAL_INLINE(Py_ssize_t)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200618STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
619 Py_ssize_t len,
620 unsigned short **outptr,
621 int native_ordering)
622{
623 unsigned short *out = *outptr;
624 const STRINGLIB_CHAR *end = in + len;
625#if STRINGLIB_SIZEOF_CHAR == 1
626 if (native_ordering) {
627 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
628 while (in < unrolled_end) {
629 out[0] = in[0];
630 out[1] = in[1];
631 out[2] = in[2];
632 out[3] = in[3];
633 in += 4; out += 4;
634 }
635 while (in < end) {
636 *out++ = *in++;
637 }
638 } else {
639# define SWAB2(CH) ((CH) << 8) /* high byte is zero */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200640 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200641 while (in < unrolled_end) {
642 out[0] = SWAB2(in[0]);
643 out[1] = SWAB2(in[1]);
644 out[2] = SWAB2(in[2]);
645 out[3] = SWAB2(in[3]);
646 in += 4; out += 4;
647 }
648 while (in < end) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200649 Py_UCS4 ch = *in++;
650 *out++ = SWAB2((Py_UCS2)ch);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200651 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200652#undef SWAB2
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200653 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200654 *outptr = out;
655 return len;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200656#else
657 if (native_ordering) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200658#if STRINGLIB_MAX_CHAR < 0x10000
659 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
660 while (in < unrolled_end) {
661 /* check if any character is a surrogate character */
662 if (((in[0] ^ 0xd800) &
663 (in[1] ^ 0xd800) &
664 (in[2] ^ 0xd800) &
665 (in[3] ^ 0xd800) & 0xf800) == 0)
666 break;
667 out[0] = in[0];
668 out[1] = in[1];
669 out[2] = in[2];
670 out[3] = in[3];
671 in += 4; out += 4;
672 }
673#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200674 while (in < end) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200675 Py_UCS4 ch;
676 ch = *in++;
677 if (ch < 0xd800)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200678 *out++ = ch;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200679 else if (ch < 0xe000)
Serhiy Storchaka7e29eea2015-05-18 22:19:42 +0300680 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200681 goto fail;
682#if STRINGLIB_MAX_CHAR >= 0x10000
683 else if (ch >= 0x10000) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200684 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
685 out[1] = Py_UNICODE_LOW_SURROGATE(ch);
686 out += 2;
687 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200688#endif
689 else
690 *out++ = ch;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200691 }
692 } else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200693#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
694#if STRINGLIB_MAX_CHAR < 0x10000
695 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
696 while (in < unrolled_end) {
697 /* check if any character is a surrogate character */
698 if (((in[0] ^ 0xd800) &
699 (in[1] ^ 0xd800) &
700 (in[2] ^ 0xd800) &
701 (in[3] ^ 0xd800) & 0xf800) == 0)
702 break;
703 out[0] = SWAB2(in[0]);
704 out[1] = SWAB2(in[1]);
705 out[2] = SWAB2(in[2]);
706 out[3] = SWAB2(in[3]);
707 in += 4; out += 4;
708 }
709#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200710 while (in < end) {
711 Py_UCS4 ch = *in++;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200712 if (ch < 0xd800)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200713 *out++ = SWAB2((Py_UCS2)ch);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200714 else if (ch < 0xe000)
Serhiy Storchaka7e29eea2015-05-18 22:19:42 +0300715 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200716 goto fail;
717#if STRINGLIB_MAX_CHAR >= 0x10000
718 else if (ch >= 0x10000) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200719 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
720 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
721 out[0] = SWAB2(ch1);
722 out[1] = SWAB2(ch2);
723 out += 2;
724 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200725#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200726 else
727 *out++ = SWAB2((Py_UCS2)ch);
728 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200729#undef SWAB2
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200730 }
731 *outptr = out;
732 return len;
733 fail:
734 *outptr = out;
735 return len - (end - in + 1);
736#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200737}
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300738
739#if STRINGLIB_SIZEOF_CHAR == 1
740# define SWAB4(CH, tmp) ((CH) << 24) /* high bytes are zero */
741#elif STRINGLIB_SIZEOF_CHAR == 2
742# define SWAB4(CH, tmp) (tmp = (CH), \
743 ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
744 /* high bytes are zero */
745#else
746# define SWAB4(CH, tmp) (tmp = (CH), \
747 tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
748 ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
749#endif
750Py_LOCAL_INLINE(Py_ssize_t)
751STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
752 Py_ssize_t len,
753 PY_UINT32_T **outptr,
754 int native_ordering)
755{
756 PY_UINT32_T *out = *outptr;
757 const STRINGLIB_CHAR *end = in + len;
758 if (native_ordering) {
759 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
760 while (in < unrolled_end) {
761#if STRINGLIB_SIZEOF_CHAR > 1
762 /* check if any character is a surrogate character */
763 if (((in[0] ^ 0xd800) &
764 (in[1] ^ 0xd800) &
765 (in[2] ^ 0xd800) &
766 (in[3] ^ 0xd800) & 0xf800) == 0)
767 break;
768#endif
769 out[0] = in[0];
770 out[1] = in[1];
771 out[2] = in[2];
772 out[3] = in[3];
773 in += 4; out += 4;
774 }
775 while (in < end) {
776 Py_UCS4 ch;
777 ch = *in++;
778#if STRINGLIB_SIZEOF_CHAR > 1
779 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchaka9ce71a62015-05-18 22:20:18 +0300780 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300781 goto fail;
782 }
783#endif
784 *out++ = ch;
785 }
786 } else {
787 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
788 while (in < unrolled_end) {
789#if STRINGLIB_SIZEOF_CHAR > 1
790 Py_UCS4 ch1, ch2, ch3, ch4;
791 /* check if any character is a surrogate character */
792 if (((in[0] ^ 0xd800) &
793 (in[1] ^ 0xd800) &
794 (in[2] ^ 0xd800) &
795 (in[3] ^ 0xd800) & 0xf800) == 0)
796 break;
797#endif
798 out[0] = SWAB4(in[0], ch1);
799 out[1] = SWAB4(in[1], ch2);
800 out[2] = SWAB4(in[2], ch3);
801 out[3] = SWAB4(in[3], ch4);
802 in += 4; out += 4;
803 }
804 while (in < end) {
805 Py_UCS4 ch = *in++;
806#if STRINGLIB_SIZEOF_CHAR > 1
807 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchaka9ce71a62015-05-18 22:20:18 +0300808 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300809 goto fail;
810 }
811#endif
812 *out++ = SWAB4(ch, ch);
813 }
814 }
815 *outptr = out;
816 return len;
817#if STRINGLIB_SIZEOF_CHAR > 1
818 fail:
819 *outptr = out;
820 return len - (end - in + 1);
821#endif
822}
823#undef SWAB4
824
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200825#endif
826
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100827#endif /* STRINGLIB_IS_UNICODE */