/* stringlib: codec implementations */

3#if STRINGLIB_IS_UNICODE
4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005/* Mask to quickly check whether a C 'long' contains a
6 non-ASCII, UTF8-encoded char. */
7#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02008# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01009#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +020010# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010011#else
12# error C 'long' size should be either 4 or 8!
13#endif
14
Mark Dickinson106c4142012-06-23 21:45:14 +010015/* 10xxxxxx */
16#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
17
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020018Py_LOCAL_INLINE(Py_UCS4)
19STRINGLIB(utf8_decode)(const char **inptr, const char *end,
20 STRINGLIB_CHAR *dest,
21 Py_ssize_t *outpos)
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010022{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020023 Py_UCS4 ch;
24 const char *s = *inptr;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020025 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020026 STRINGLIB_CHAR *p = dest + *outpos;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010027
28 while (s < end) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020029 ch = (unsigned char)*s;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010030
31 if (ch < 0x80) {
32 /* Fast path for runs of ASCII characters. Given that common UTF-8
33 input will consist of an overwhelming majority of ASCII
34 characters, we try to optimize for this case by checking
35 as many characters as a C 'long' can contain.
36 First, check if we can do an aligned read, as most CPUs have
37 a penalty for unaligned reads.
38 */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020039 if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010040 /* Help register allocation */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020041 const char *_s = s;
42 STRINGLIB_CHAR *_p = p;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010043 while (_s < aligned_end) {
44 /* Read a whole long at a time (either 4 or 8 bytes),
45 and do a fast unrolled copy if it only contains ASCII
46 characters. */
47 unsigned long value = *(unsigned long *) _s;
48 if (value & ASCII_CHAR_MASK)
49 break;
Christian Heimes743e0cd2012-10-17 23:52:17 +020050#if PY_LITTLE_ENDIAN
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020051 _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
52 _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
53 _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
54 _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
55# if SIZEOF_LONG == 8
56 _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
57 _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
58 _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
59 _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
60# endif
61#else
62# if SIZEOF_LONG == 8
63 _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
64 _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
65 _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
66 _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
67 _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
68 _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
69 _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
70 _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
71# else
72 _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
73 _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
74 _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
75 _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
76# endif
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010077#endif
78 _s += SIZEOF_LONG;
79 _p += SIZEOF_LONG;
80 }
81 s = _s;
82 p = _p;
83 if (s == end)
84 break;
85 ch = (unsigned char)*s;
86 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020087 if (ch < 0x80) {
88 s++;
89 *p++ = ch;
90 continue;
91 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010092 }
93
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020094 if (ch < 0xE0) {
95 /* \xC2\x80-\xDF\xBF -- 0080-07FF */
Victor Stinnerab60de42012-11-04 23:59:15 +010096 Py_UCS4 ch2;
Ezio Melottif7ed5d12012-11-04 23:21:38 +020097 if (ch < 0xC2) {
98 /* invalid sequence
99 \x80-\xBF -- continuation byte
100 \xC0-\xC1 -- fake 0000-007F */
101 goto InvalidStart;
102 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200103 if (end - s < 2) {
104 /* unexpected end of data: the caller will decide whether
105 it's an error or not */
106 break;
107 }
108 ch2 = (unsigned char)s[1];
Mark Dickinson106c4142012-06-23 21:45:14 +0100109 if (!IS_CONTINUATION_BYTE(ch2))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200110 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200111 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200112 ch = (ch << 6) + ch2 -
113 ((0xC0 << 6) + 0x80);
114 assert ((ch > 0x007F) && (ch <= 0x07FF));
115 s += 2;
116 if (STRINGLIB_MAX_CHAR <= 0x007F ||
117 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200118 /* Out-of-range */
119 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100120 *p++ = ch;
121 continue;
122 }
123
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200124 if (ch < 0xF0) {
125 /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
126 Py_UCS4 ch2, ch3;
127 if (end - s < 3) {
128 /* unexpected end of data: the caller will decide whether
129 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200130 if (end - s < 2)
131 break;
132 ch2 = (unsigned char)s[1];
133 if (!IS_CONTINUATION_BYTE(ch2) ||
134 (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
135 /* for clarification see comments below */
136 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200137 break;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100138 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200139 ch2 = (unsigned char)s[1];
140 ch3 = (unsigned char)s[2];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200141 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200142 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200143 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200144 }
145 if (ch == 0xE0) {
146 if (ch2 < 0xA0)
147 /* invalid sequence
148 \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200149 goto InvalidContinuation1;
150 } else if (ch == 0xED && ch2 >= 0xA0) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200151 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
152 will result in surrogates in range D800-DFFF. Surrogates are
153 not valid UTF-8 so they are rejected.
154 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
155 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200156 goto InvalidContinuation1;
157 }
158 if (!IS_CONTINUATION_BYTE(ch3)) {
159 /* invalid continuation byte */
160 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200161 }
162 ch = (ch << 12) + (ch2 << 6) + ch3 -
163 ((0xE0 << 12) + (0x80 << 6) + 0x80);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100164 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
165 s += 3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200166 if (STRINGLIB_MAX_CHAR <= 0x07FF ||
167 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200168 /* Out-of-range */
169 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100170 *p++ = ch;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200171 continue;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100172 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200173
174 if (ch < 0xF5) {
175 /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
176 Py_UCS4 ch2, ch3, ch4;
177 if (end - s < 4) {
178 /* unexpected end of data: the caller will decide whether
179 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200180 if (end - s < 2)
181 break;
182 ch2 = (unsigned char)s[1];
183 if (!IS_CONTINUATION_BYTE(ch2) ||
184 (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
185 /* for clarification see comments below */
186 goto InvalidContinuation1;
187 if (end - s < 3)
188 break;
189 ch3 = (unsigned char)s[2];
190 if (!IS_CONTINUATION_BYTE(ch3))
191 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200192 break;
193 }
194 ch2 = (unsigned char)s[1];
195 ch3 = (unsigned char)s[2];
196 ch4 = (unsigned char)s[3];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200197 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200198 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200199 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200200 }
201 if (ch == 0xF0) {
202 if (ch2 < 0x90)
203 /* invalid sequence
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200204 \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
205 goto InvalidContinuation1;
206 } else if (ch == 0xF4 && ch2 >= 0x90) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200207 /* invalid sequence
208 \xF4\x90\x80\80- -- 110000- overflow */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200209 goto InvalidContinuation1;
210 }
211 if (!IS_CONTINUATION_BYTE(ch3)) {
212 /* invalid continuation byte */
213 goto InvalidContinuation2;
214 }
215 if (!IS_CONTINUATION_BYTE(ch4)) {
216 /* invalid continuation byte */
217 goto InvalidContinuation3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200218 }
219 ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
220 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
221 assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
222 s += 4;
223 if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
224 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200225 /* Out-of-range */
226 goto Return;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200227 *p++ = ch;
228 continue;
229 }
230 goto InvalidStart;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100231 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200232 ch = 0;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200233Return:
234 *inptr = s;
235 *outpos = p - dest;
236 return ch;
237InvalidStart:
238 ch = 1;
239 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200240InvalidContinuation1:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200241 ch = 2;
242 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200243InvalidContinuation2:
244 ch = 3;
245 goto Return;
246InvalidContinuation3:
247 ch = 4;
248 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100249}
250
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100251#undef ASCII_CHAR_MASK
252
Victor Stinner6099a032011-12-18 14:22:26 +0100253
254/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
255 PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
256 UCS-1 strings don't need to handle surrogates for example. */
257Py_LOCAL_INLINE(PyObject *)
258STRINGLIB(utf8_encoder)(PyObject *unicode,
259 STRINGLIB_CHAR *data,
260 Py_ssize_t size,
261 const char *errors)
262{
263#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
264
265 Py_ssize_t i; /* index into s of next input byte */
Victor Stinner6099a032011-12-18 14:22:26 +0100266 char *p; /* next free byte in output buffer */
Victor Stinner6099a032011-12-18 14:22:26 +0100267#if STRINGLIB_SIZEOF_CHAR > 1
Victor Stinner01ada392015-10-01 21:54:51 +0200268 PyObject *error_handler_obj = NULL;
Victor Stinner6099a032011-12-18 14:22:26 +0100269 PyObject *exc = NULL;
270 PyObject *rep = NULL;
Victor Stinner01ada392015-10-01 21:54:51 +0200271 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6099a032011-12-18 14:22:26 +0100272#endif
273#if STRINGLIB_SIZEOF_CHAR == 1
274 const Py_ssize_t max_char_size = 2;
Victor Stinner6099a032011-12-18 14:22:26 +0100275#elif STRINGLIB_SIZEOF_CHAR == 2
276 const Py_ssize_t max_char_size = 3;
Victor Stinner6099a032011-12-18 14:22:26 +0100277#else /* STRINGLIB_SIZEOF_CHAR == 4 */
278 const Py_ssize_t max_char_size = 4;
Victor Stinner6099a032011-12-18 14:22:26 +0100279#endif
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200280 _PyBytesWriter writer;
Victor Stinner6099a032011-12-18 14:22:26 +0100281
282 assert(size >= 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200283 _PyBytesWriter_Init(&writer);
Victor Stinner6099a032011-12-18 14:22:26 +0100284
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200285 if (size > PY_SSIZE_T_MAX / max_char_size) {
286 /* integer overflow */
287 return PyErr_NoMemory();
Victor Stinner6099a032011-12-18 14:22:26 +0100288 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200289
290 p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
291 if (p == NULL)
292 return NULL;
Victor Stinner6099a032011-12-18 14:22:26 +0100293
294 for (i = 0; i < size;) {
295 Py_UCS4 ch = data[i++];
296
297 if (ch < 0x80) {
298 /* Encode ASCII */
299 *p++ = (char) ch;
300
301 }
302 else
303#if STRINGLIB_SIZEOF_CHAR > 1
304 if (ch < 0x0800)
305#endif
306 {
307 /* Encode Latin-1 */
308 *p++ = (char)(0xc0 | (ch >> 6));
309 *p++ = (char)(0x80 | (ch & 0x3f));
310 }
311#if STRINGLIB_SIZEOF_CHAR > 1
312 else if (Py_UNICODE_IS_SURROGATE(ch)) {
Victor Stinner01ada392015-10-01 21:54:51 +0200313 Py_ssize_t startpos, endpos, newpos;
314 Py_ssize_t repsize, k;
315 if (error_handler == _Py_ERROR_UNKNOWN)
316 error_handler = get_error_handler(errors);
317
Victor Stinner6099a032011-12-18 14:22:26 +0100318 startpos = i-1;
Victor Stinner01ada392015-10-01 21:54:51 +0200319 endpos = startpos+1;
Victor Stinner6099a032011-12-18 14:22:26 +0100320
Victor Stinner01ada392015-10-01 21:54:51 +0200321 while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
322 endpos++;
Victor Stinner6099a032011-12-18 14:22:26 +0100323
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200324 /* Only overallocate the buffer if it's not the last write */
325 writer.overallocate = (endpos < size);
326
Victor Stinner01ada392015-10-01 21:54:51 +0200327 switch (error_handler)
328 {
329 case _Py_ERROR_REPLACE:
330 memset(p, '?', endpos - startpos);
331 p += (endpos - startpos);
332 /* fall through the ignore handler */
333 case _Py_ERROR_IGNORE:
334 i += (endpos - startpos - 1);
335 break;
Victor Stinner6099a032011-12-18 14:22:26 +0100336
Victor Stinner01ada392015-10-01 21:54:51 +0200337 case _Py_ERROR_SURROGATEPASS:
338 for (k=startpos; k<endpos; k++) {
339 ch = data[k];
340 *p++ = (char)(0xe0 | (ch >> 12));
341 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
342 *p++ = (char)(0x80 | (ch & 0x3f));
343 }
344 i += (endpos - startpos - 1);
345 break;
346
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200347 case _Py_ERROR_BACKSLASHREPLACE:
Victor Stinnerad771582015-10-09 12:38:53 +0200348 /* substract preallocated bytes */
349 writer.min_size -= max_char_size * (endpos - startpos);
350 p = backslashreplace(&writer, p,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200351 unicode, startpos, endpos);
352 if (p == NULL)
353 goto error;
354 i += (endpos - startpos - 1);
355 break;
356
357 case _Py_ERROR_XMLCHARREFREPLACE:
Victor Stinnerad771582015-10-09 12:38:53 +0200358 /* substract preallocated bytes */
359 writer.min_size -= max_char_size * (endpos - startpos);
360 p = xmlcharrefreplace(&writer, p,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200361 unicode, startpos, endpos);
362 if (p == NULL)
363 goto error;
364 i += (endpos - startpos - 1);
365 break;
366
Victor Stinner01ada392015-10-01 21:54:51 +0200367 case _Py_ERROR_SURROGATEESCAPE:
368 for (k=startpos; k<endpos; k++) {
369 ch = data[k];
370 if (!(0xDC80 <= ch && ch <= 0xDCFF))
371 break;
372 *p++ = (char)(ch & 0xff);
373 }
374 if (k >= endpos) {
375 i += (endpos - startpos - 1);
376 break;
377 }
378 startpos = k;
379 assert(startpos < endpos);
380 /* fall through the default handler */
Victor Stinner01ada392015-10-01 21:54:51 +0200381 default:
382 rep = unicode_encode_call_errorhandler(
383 errors, &error_handler_obj, "utf-8", "surrogates not allowed",
384 unicode, &exc, startpos, endpos, &newpos);
385 if (!rep)
386 goto error;
387
Victor Stinnerad771582015-10-09 12:38:53 +0200388 /* substract preallocated bytes */
389 writer.min_size -= max_char_size;
390
Victor Stinner01ada392015-10-01 21:54:51 +0200391 if (PyBytes_Check(rep)) {
Victor Stinnerce179bf2015-10-09 12:57:22 +0200392 p = _PyBytesWriter_WriteBytes(&writer, p,
393 PyBytes_AS_STRING(rep),
394 PyBytes_GET_SIZE(rep));
395 if (p == NULL)
396 goto error;
Victor Stinner01ada392015-10-01 21:54:51 +0200397 }
398 else {
399 /* rep is unicode */
400 if (PyUnicode_READY(rep) < 0)
401 goto error;
Victor Stinner6099a032011-12-18 14:22:26 +0100402
Victor Stinnerce179bf2015-10-09 12:57:22 +0200403 repsize = PyUnicode_GET_LENGTH(rep);
404
405 p = _PyBytesWriter_Prepare(&writer, p, repsize);
406 if (p == NULL)
407 goto error;
408
Victor Stinner01ada392015-10-01 21:54:51 +0200409 if (!PyUnicode_IS_ASCII(rep)) {
Victor Stinner6099a032011-12-18 14:22:26 +0100410 raise_encode_exception(&exc, "utf-8",
411 unicode,
412 i-1, i,
413 "surrogates not allowed");
414 goto error;
415 }
Victor Stinner01ada392015-10-01 21:54:51 +0200416
417 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
418 memcpy(p, PyUnicode_DATA(rep), repsize);
419 p += repsize;
Victor Stinner6099a032011-12-18 14:22:26 +0100420 }
Victor Stinner01ada392015-10-01 21:54:51 +0200421 Py_CLEAR(rep);
422
423 i = newpos;
Victor Stinner6099a032011-12-18 14:22:26 +0100424 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200425
426 /* If overallocation was disabled, ensure that it was the last
427 write. Otherwise, we missed an optimization */
428 assert(writer.overallocate || i == size);
Victor Stinner6099a032011-12-18 14:22:26 +0100429 }
430 else
431#if STRINGLIB_SIZEOF_CHAR > 2
432 if (ch < 0x10000)
433#endif
434 {
435 *p++ = (char)(0xe0 | (ch >> 12));
436 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
437 *p++ = (char)(0x80 | (ch & 0x3f));
438 }
439#if STRINGLIB_SIZEOF_CHAR > 2
440 else /* ch >= 0x10000 */
441 {
442 assert(ch <= MAX_UNICODE);
443 /* Encode UCS4 Unicode ordinals */
444 *p++ = (char)(0xf0 | (ch >> 18));
445 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
446 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
447 *p++ = (char)(0x80 | (ch & 0x3f));
448 }
449#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
450#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
451 }
452
Victor Stinner6099a032011-12-18 14:22:26 +0100453#if STRINGLIB_SIZEOF_CHAR > 1
Victor Stinner01ada392015-10-01 21:54:51 +0200454 Py_XDECREF(error_handler_obj);
Victor Stinner6099a032011-12-18 14:22:26 +0100455 Py_XDECREF(exc);
456#endif
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200457 return _PyBytesWriter_Finish(&writer, p);
Victor Stinner6099a032011-12-18 14:22:26 +0100458
459#if STRINGLIB_SIZEOF_CHAR > 1
460 error:
461 Py_XDECREF(rep);
Victor Stinner01ada392015-10-01 21:54:51 +0200462 Py_XDECREF(error_handler_obj);
Victor Stinner6099a032011-12-18 14:22:26 +0100463 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200464 _PyBytesWriter_Dealloc(&writer);
Victor Stinner6099a032011-12-18 14:22:26 +0100465 return NULL;
466#endif
467
468#undef MAX_SHORT_UNICHARS
469}
470
Antoine Pitrou63065d72012-05-15 23:48:04 +0200471/* The pattern for constructing UCS2-repeated masks. */
472#if SIZEOF_LONG == 8
473# define UCS2_REPEAT_MASK 0x0001000100010001ul
474#elif SIZEOF_LONG == 4
475# define UCS2_REPEAT_MASK 0x00010001ul
476#else
477# error C 'long' size should be either 4 or 8!
478#endif
479
480/* The mask for fast checking. */
481#if STRINGLIB_SIZEOF_CHAR == 1
482/* The mask for fast checking of whether a C 'long' contains a
483 non-ASCII or non-Latin1 UTF16-encoded characters. */
484# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
485#else
486/* The mask for fast checking of whether a C 'long' may contain
487 UTF16-encoded surrogate characters. This is an efficient heuristic,
488 assuming that non-surrogate characters with a code point >= 0x8000 are
489 rare in most input.
490*/
491# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
492#endif
493/* The mask for fast byte-swapping. */
494#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
495/* Swap bytes. */
496#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
497 (((value) & STRIPPED_MASK) << 8))
498
499Py_LOCAL_INLINE(Py_UCS4)
500STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
501 STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
502 int native_ordering)
503{
504 Py_UCS4 ch;
505 const unsigned char *aligned_end =
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200506 (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
Antoine Pitrou63065d72012-05-15 23:48:04 +0200507 const unsigned char *q = *inptr;
508 STRINGLIB_CHAR *p = dest + *outpos;
509 /* Offsets from q for retrieving byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +0200510#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +0200511 int ihi = !!native_ordering, ilo = !native_ordering;
512#else
513 int ihi = !native_ordering, ilo = !!native_ordering;
514#endif
515 --e;
516
517 while (q < e) {
518 Py_UCS4 ch2;
519 /* First check for possible aligned read of a C 'long'. Unaligned
520 reads are more expensive, better to defer to another iteration. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200521 if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
Antoine Pitrou63065d72012-05-15 23:48:04 +0200522 /* Fast path for runs of in-range non-surrogate chars. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200523 const unsigned char *_q = q;
Antoine Pitrou63065d72012-05-15 23:48:04 +0200524 while (_q < aligned_end) {
525 unsigned long block = * (unsigned long *) _q;
526 if (native_ordering) {
527 /* Can use buffer directly */
528 if (block & FAST_CHAR_MASK)
529 break;
530 }
531 else {
532 /* Need to byte-swap */
533 if (block & SWAB(FAST_CHAR_MASK))
534 break;
535#if STRINGLIB_SIZEOF_CHAR == 1
536 block >>= 8;
537#else
538 block = SWAB(block);
539#endif
540 }
Christian Heimes743e0cd2012-10-17 23:52:17 +0200541#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +0200542# if SIZEOF_LONG == 4
543 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
544 p[1] = (STRINGLIB_CHAR)(block >> 16);
545# elif SIZEOF_LONG == 8
546 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
547 p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
548 p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
549 p[3] = (STRINGLIB_CHAR)(block >> 48);
550# endif
551#else
552# if SIZEOF_LONG == 4
553 p[0] = (STRINGLIB_CHAR)(block >> 16);
554 p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
555# elif SIZEOF_LONG == 8
556 p[0] = (STRINGLIB_CHAR)(block >> 48);
557 p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
558 p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
559 p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
560# endif
561#endif
562 _q += SIZEOF_LONG;
563 p += SIZEOF_LONG / 2;
564 }
565 q = _q;
566 if (q >= e)
567 break;
568 }
569
570 ch = (q[ihi] << 8) | q[ilo];
571 q += 2;
572 if (!Py_UNICODE_IS_SURROGATE(ch)) {
573#if STRINGLIB_SIZEOF_CHAR < 2
574 if (ch > STRINGLIB_MAX_CHAR)
575 /* Out-of-range */
576 goto Return;
577#endif
578 *p++ = (STRINGLIB_CHAR)ch;
579 continue;
580 }
581
582 /* UTF-16 code pair: */
583 if (q >= e)
584 goto UnexpectedEnd;
585 if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
586 goto IllegalEncoding;
587 ch2 = (q[ihi] << 8) | q[ilo];
588 q += 2;
589 if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
590 goto IllegalSurrogate;
591 ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
592#if STRINGLIB_SIZEOF_CHAR < 4
593 /* Out-of-range */
594 goto Return;
595#else
596 *p++ = (STRINGLIB_CHAR)ch;
597#endif
598 }
599 ch = 0;
600Return:
601 *inptr = q;
602 *outpos = p - dest;
603 return ch;
604UnexpectedEnd:
605 ch = 1;
606 goto Return;
607IllegalEncoding:
608 ch = 2;
609 goto Return;
610IllegalSurrogate:
611 ch = 3;
612 goto Return;
613}
614#undef UCS2_REPEAT_MASK
615#undef FAST_CHAR_MASK
616#undef STRIPPED_MASK
617#undef SWAB
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200618
619
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200620#if STRINGLIB_MAX_CHAR >= 0x80
621Py_LOCAL_INLINE(Py_ssize_t)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200622STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
623 Py_ssize_t len,
624 unsigned short **outptr,
625 int native_ordering)
626{
627 unsigned short *out = *outptr;
628 const STRINGLIB_CHAR *end = in + len;
629#if STRINGLIB_SIZEOF_CHAR == 1
630 if (native_ordering) {
631 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
632 while (in < unrolled_end) {
633 out[0] = in[0];
634 out[1] = in[1];
635 out[2] = in[2];
636 out[3] = in[3];
637 in += 4; out += 4;
638 }
639 while (in < end) {
640 *out++ = *in++;
641 }
642 } else {
643# define SWAB2(CH) ((CH) << 8) /* high byte is zero */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200644 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200645 while (in < unrolled_end) {
646 out[0] = SWAB2(in[0]);
647 out[1] = SWAB2(in[1]);
648 out[2] = SWAB2(in[2]);
649 out[3] = SWAB2(in[3]);
650 in += 4; out += 4;
651 }
652 while (in < end) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200653 Py_UCS4 ch = *in++;
654 *out++ = SWAB2((Py_UCS2)ch);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200655 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200656#undef SWAB2
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200657 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200658 *outptr = out;
659 return len;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200660#else
661 if (native_ordering) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200662#if STRINGLIB_MAX_CHAR < 0x10000
663 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
664 while (in < unrolled_end) {
665 /* check if any character is a surrogate character */
666 if (((in[0] ^ 0xd800) &
667 (in[1] ^ 0xd800) &
668 (in[2] ^ 0xd800) &
669 (in[3] ^ 0xd800) & 0xf800) == 0)
670 break;
671 out[0] = in[0];
672 out[1] = in[1];
673 out[2] = in[2];
674 out[3] = in[3];
675 in += 4; out += 4;
676 }
677#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200678 while (in < end) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200679 Py_UCS4 ch;
680 ch = *in++;
681 if (ch < 0xd800)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200682 *out++ = ch;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200683 else if (ch < 0xe000)
Serhiy Storchaka7e29eea2015-05-18 22:19:42 +0300684 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200685 goto fail;
686#if STRINGLIB_MAX_CHAR >= 0x10000
687 else if (ch >= 0x10000) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200688 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
689 out[1] = Py_UNICODE_LOW_SURROGATE(ch);
690 out += 2;
691 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200692#endif
693 else
694 *out++ = ch;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200695 }
696 } else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200697#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
698#if STRINGLIB_MAX_CHAR < 0x10000
699 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
700 while (in < unrolled_end) {
701 /* check if any character is a surrogate character */
702 if (((in[0] ^ 0xd800) &
703 (in[1] ^ 0xd800) &
704 (in[2] ^ 0xd800) &
705 (in[3] ^ 0xd800) & 0xf800) == 0)
706 break;
707 out[0] = SWAB2(in[0]);
708 out[1] = SWAB2(in[1]);
709 out[2] = SWAB2(in[2]);
710 out[3] = SWAB2(in[3]);
711 in += 4; out += 4;
712 }
713#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200714 while (in < end) {
715 Py_UCS4 ch = *in++;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200716 if (ch < 0xd800)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200717 *out++ = SWAB2((Py_UCS2)ch);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200718 else if (ch < 0xe000)
Serhiy Storchaka7e29eea2015-05-18 22:19:42 +0300719 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200720 goto fail;
721#if STRINGLIB_MAX_CHAR >= 0x10000
722 else if (ch >= 0x10000) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200723 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
724 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
725 out[0] = SWAB2(ch1);
726 out[1] = SWAB2(ch2);
727 out += 2;
728 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200729#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200730 else
731 *out++ = SWAB2((Py_UCS2)ch);
732 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200733#undef SWAB2
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200734 }
735 *outptr = out;
736 return len;
737 fail:
738 *outptr = out;
739 return len - (end - in + 1);
740#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200741}
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300742
743#if STRINGLIB_SIZEOF_CHAR == 1
744# define SWAB4(CH, tmp) ((CH) << 24) /* high bytes are zero */
745#elif STRINGLIB_SIZEOF_CHAR == 2
746# define SWAB4(CH, tmp) (tmp = (CH), \
747 ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
748 /* high bytes are zero */
749#else
750# define SWAB4(CH, tmp) (tmp = (CH), \
751 tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
752 ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
753#endif
754Py_LOCAL_INLINE(Py_ssize_t)
755STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
756 Py_ssize_t len,
757 PY_UINT32_T **outptr,
758 int native_ordering)
759{
760 PY_UINT32_T *out = *outptr;
761 const STRINGLIB_CHAR *end = in + len;
762 if (native_ordering) {
763 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
764 while (in < unrolled_end) {
765#if STRINGLIB_SIZEOF_CHAR > 1
766 /* check if any character is a surrogate character */
767 if (((in[0] ^ 0xd800) &
768 (in[1] ^ 0xd800) &
769 (in[2] ^ 0xd800) &
770 (in[3] ^ 0xd800) & 0xf800) == 0)
771 break;
772#endif
773 out[0] = in[0];
774 out[1] = in[1];
775 out[2] = in[2];
776 out[3] = in[3];
777 in += 4; out += 4;
778 }
779 while (in < end) {
780 Py_UCS4 ch;
781 ch = *in++;
782#if STRINGLIB_SIZEOF_CHAR > 1
783 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchaka9ce71a62015-05-18 22:20:18 +0300784 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300785 goto fail;
786 }
787#endif
788 *out++ = ch;
789 }
790 } else {
791 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
792 while (in < unrolled_end) {
793#if STRINGLIB_SIZEOF_CHAR > 1
794 Py_UCS4 ch1, ch2, ch3, ch4;
795 /* check if any character is a surrogate character */
796 if (((in[0] ^ 0xd800) &
797 (in[1] ^ 0xd800) &
798 (in[2] ^ 0xd800) &
799 (in[3] ^ 0xd800) & 0xf800) == 0)
800 break;
801#endif
802 out[0] = SWAB4(in[0], ch1);
803 out[1] = SWAB4(in[1], ch2);
804 out[2] = SWAB4(in[2], ch3);
805 out[3] = SWAB4(in[3], ch4);
806 in += 4; out += 4;
807 }
808 while (in < end) {
809 Py_UCS4 ch = *in++;
810#if STRINGLIB_SIZEOF_CHAR > 1
811 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchaka9ce71a62015-05-18 22:20:18 +0300812 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300813 goto fail;
814 }
815#endif
816 *out++ = SWAB4(ch, ch);
817 }
818 }
819 *outptr = out;
820 return len;
821#if STRINGLIB_SIZEOF_CHAR > 1
822 fail:
823 *outptr = out;
824 return len - (end - in + 1);
825#endif
826}
827#undef SWAB4
828
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200829#endif
830
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100831#endif /* STRINGLIB_IS_UNICODE */