/* stringlib: codec implementations */
2
Serhiy Storchakabcde10a2016-05-16 09:42:29 +03003#if !STRINGLIB_IS_UNICODE
4# error "codecs.h is specific to Unicode"
5#endif
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01006
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01007/* Mask to quickly check whether a C 'long' contains a
8 non-ASCII, UTF8-encoded char. */
9#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +020010# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010011#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +020012# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010013#else
14# error C 'long' size should be either 4 or 8!
15#endif
16
Mark Dickinson106c4142012-06-23 21:45:14 +010017/* 10xxxxxx */
18#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
19
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020020Py_LOCAL_INLINE(Py_UCS4)
21STRINGLIB(utf8_decode)(const char **inptr, const char *end,
22 STRINGLIB_CHAR *dest,
23 Py_ssize_t *outpos)
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010024{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020025 Py_UCS4 ch;
26 const char *s = *inptr;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020027 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020028 STRINGLIB_CHAR *p = dest + *outpos;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010029
30 while (s < end) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020031 ch = (unsigned char)*s;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010032
33 if (ch < 0x80) {
34 /* Fast path for runs of ASCII characters. Given that common UTF-8
35 input will consist of an overwhelming majority of ASCII
36 characters, we try to optimize for this case by checking
37 as many characters as a C 'long' can contain.
38 First, check if we can do an aligned read, as most CPUs have
39 a penalty for unaligned reads.
40 */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020041 if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010042 /* Help register allocation */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020043 const char *_s = s;
44 STRINGLIB_CHAR *_p = p;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010045 while (_s < aligned_end) {
46 /* Read a whole long at a time (either 4 or 8 bytes),
47 and do a fast unrolled copy if it only contains ASCII
48 characters. */
49 unsigned long value = *(unsigned long *) _s;
50 if (value & ASCII_CHAR_MASK)
51 break;
Christian Heimes743e0cd2012-10-17 23:52:17 +020052#if PY_LITTLE_ENDIAN
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020053 _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
54 _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
55 _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
56 _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
57# if SIZEOF_LONG == 8
58 _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
59 _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
60 _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
61 _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
62# endif
63#else
64# if SIZEOF_LONG == 8
65 _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
66 _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
67 _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
68 _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
69 _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
70 _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
71 _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
72 _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
73# else
74 _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
75 _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
76 _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
77 _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
78# endif
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010079#endif
80 _s += SIZEOF_LONG;
81 _p += SIZEOF_LONG;
82 }
83 s = _s;
84 p = _p;
85 if (s == end)
86 break;
87 ch = (unsigned char)*s;
88 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020089 if (ch < 0x80) {
90 s++;
91 *p++ = ch;
92 continue;
93 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010094 }
95
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020096 if (ch < 0xE0) {
97 /* \xC2\x80-\xDF\xBF -- 0080-07FF */
Victor Stinnerab60de42012-11-04 23:59:15 +010098 Py_UCS4 ch2;
Ezio Melottif7ed5d12012-11-04 23:21:38 +020099 if (ch < 0xC2) {
100 /* invalid sequence
101 \x80-\xBF -- continuation byte
102 \xC0-\xC1 -- fake 0000-007F */
103 goto InvalidStart;
104 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200105 if (end - s < 2) {
106 /* unexpected end of data: the caller will decide whether
107 it's an error or not */
108 break;
109 }
110 ch2 = (unsigned char)s[1];
Mark Dickinson106c4142012-06-23 21:45:14 +0100111 if (!IS_CONTINUATION_BYTE(ch2))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200112 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200113 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200114 ch = (ch << 6) + ch2 -
115 ((0xC0 << 6) + 0x80);
116 assert ((ch > 0x007F) && (ch <= 0x07FF));
117 s += 2;
118 if (STRINGLIB_MAX_CHAR <= 0x007F ||
119 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200120 /* Out-of-range */
121 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100122 *p++ = ch;
123 continue;
124 }
125
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200126 if (ch < 0xF0) {
127 /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
128 Py_UCS4 ch2, ch3;
129 if (end - s < 3) {
130 /* unexpected end of data: the caller will decide whether
131 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200132 if (end - s < 2)
133 break;
134 ch2 = (unsigned char)s[1];
135 if (!IS_CONTINUATION_BYTE(ch2) ||
136 (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
137 /* for clarification see comments below */
138 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200139 break;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100140 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200141 ch2 = (unsigned char)s[1];
142 ch3 = (unsigned char)s[2];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200143 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200144 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200145 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200146 }
147 if (ch == 0xE0) {
148 if (ch2 < 0xA0)
149 /* invalid sequence
150 \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200151 goto InvalidContinuation1;
152 } else if (ch == 0xED && ch2 >= 0xA0) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200153 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
154 will result in surrogates in range D800-DFFF. Surrogates are
155 not valid UTF-8 so they are rejected.
156 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
157 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200158 goto InvalidContinuation1;
159 }
160 if (!IS_CONTINUATION_BYTE(ch3)) {
161 /* invalid continuation byte */
162 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200163 }
164 ch = (ch << 12) + (ch2 << 6) + ch3 -
165 ((0xE0 << 12) + (0x80 << 6) + 0x80);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100166 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
167 s += 3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200168 if (STRINGLIB_MAX_CHAR <= 0x07FF ||
169 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200170 /* Out-of-range */
171 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100172 *p++ = ch;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200173 continue;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100174 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200175
176 if (ch < 0xF5) {
177 /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
178 Py_UCS4 ch2, ch3, ch4;
179 if (end - s < 4) {
180 /* unexpected end of data: the caller will decide whether
181 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200182 if (end - s < 2)
183 break;
184 ch2 = (unsigned char)s[1];
185 if (!IS_CONTINUATION_BYTE(ch2) ||
186 (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
187 /* for clarification see comments below */
188 goto InvalidContinuation1;
189 if (end - s < 3)
190 break;
191 ch3 = (unsigned char)s[2];
192 if (!IS_CONTINUATION_BYTE(ch3))
193 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200194 break;
195 }
196 ch2 = (unsigned char)s[1];
197 ch3 = (unsigned char)s[2];
198 ch4 = (unsigned char)s[3];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200199 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200200 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200201 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200202 }
203 if (ch == 0xF0) {
204 if (ch2 < 0x90)
205 /* invalid sequence
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200206 \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
207 goto InvalidContinuation1;
208 } else if (ch == 0xF4 && ch2 >= 0x90) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200209 /* invalid sequence
210 \xF4\x90\x80\80- -- 110000- overflow */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200211 goto InvalidContinuation1;
212 }
213 if (!IS_CONTINUATION_BYTE(ch3)) {
214 /* invalid continuation byte */
215 goto InvalidContinuation2;
216 }
217 if (!IS_CONTINUATION_BYTE(ch4)) {
218 /* invalid continuation byte */
219 goto InvalidContinuation3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200220 }
221 ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
222 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
223 assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
224 s += 4;
225 if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
226 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200227 /* Out-of-range */
228 goto Return;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200229 *p++ = ch;
230 continue;
231 }
232 goto InvalidStart;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100233 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200234 ch = 0;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200235Return:
236 *inptr = s;
237 *outpos = p - dest;
238 return ch;
239InvalidStart:
240 ch = 1;
241 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200242InvalidContinuation1:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200243 ch = 2;
244 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200245InvalidContinuation2:
246 ch = 3;
247 goto Return;
248InvalidContinuation3:
249 ch = 4;
250 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100251}
252
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100253#undef ASCII_CHAR_MASK
254
Victor Stinner6099a032011-12-18 14:22:26 +0100255
256/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
257 PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
258 UCS-1 strings don't need to handle surrogates for example. */
259Py_LOCAL_INLINE(PyObject *)
260STRINGLIB(utf8_encoder)(PyObject *unicode,
261 STRINGLIB_CHAR *data,
262 Py_ssize_t size,
263 const char *errors)
264{
265#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
266
267 Py_ssize_t i; /* index into s of next input byte */
Victor Stinner6099a032011-12-18 14:22:26 +0100268 char *p; /* next free byte in output buffer */
Victor Stinner6099a032011-12-18 14:22:26 +0100269#if STRINGLIB_SIZEOF_CHAR > 1
Victor Stinner01ada392015-10-01 21:54:51 +0200270 PyObject *error_handler_obj = NULL;
Victor Stinner6099a032011-12-18 14:22:26 +0100271 PyObject *exc = NULL;
272 PyObject *rep = NULL;
Victor Stinner01ada392015-10-01 21:54:51 +0200273 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6099a032011-12-18 14:22:26 +0100274#endif
275#if STRINGLIB_SIZEOF_CHAR == 1
276 const Py_ssize_t max_char_size = 2;
Victor Stinner6099a032011-12-18 14:22:26 +0100277#elif STRINGLIB_SIZEOF_CHAR == 2
278 const Py_ssize_t max_char_size = 3;
Victor Stinner6099a032011-12-18 14:22:26 +0100279#else /* STRINGLIB_SIZEOF_CHAR == 4 */
280 const Py_ssize_t max_char_size = 4;
Victor Stinner6099a032011-12-18 14:22:26 +0100281#endif
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200282 _PyBytesWriter writer;
Victor Stinner6099a032011-12-18 14:22:26 +0100283
284 assert(size >= 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200285 _PyBytesWriter_Init(&writer);
Victor Stinner6099a032011-12-18 14:22:26 +0100286
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200287 if (size > PY_SSIZE_T_MAX / max_char_size) {
288 /* integer overflow */
289 return PyErr_NoMemory();
Victor Stinner6099a032011-12-18 14:22:26 +0100290 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200291
292 p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
293 if (p == NULL)
294 return NULL;
Victor Stinner6099a032011-12-18 14:22:26 +0100295
296 for (i = 0; i < size;) {
297 Py_UCS4 ch = data[i++];
298
299 if (ch < 0x80) {
300 /* Encode ASCII */
301 *p++ = (char) ch;
302
303 }
304 else
305#if STRINGLIB_SIZEOF_CHAR > 1
306 if (ch < 0x0800)
307#endif
308 {
309 /* Encode Latin-1 */
310 *p++ = (char)(0xc0 | (ch >> 6));
311 *p++ = (char)(0x80 | (ch & 0x3f));
312 }
313#if STRINGLIB_SIZEOF_CHAR > 1
314 else if (Py_UNICODE_IS_SURROGATE(ch)) {
Victor Stinner01ada392015-10-01 21:54:51 +0200315 Py_ssize_t startpos, endpos, newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +0200316 Py_ssize_t k;
Victor Stinner01ada392015-10-01 21:54:51 +0200317 if (error_handler == _Py_ERROR_UNKNOWN)
318 error_handler = get_error_handler(errors);
319
Victor Stinner6099a032011-12-18 14:22:26 +0100320 startpos = i-1;
Victor Stinner01ada392015-10-01 21:54:51 +0200321 endpos = startpos+1;
Victor Stinner6099a032011-12-18 14:22:26 +0100322
Victor Stinner01ada392015-10-01 21:54:51 +0200323 while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
324 endpos++;
Victor Stinner6099a032011-12-18 14:22:26 +0100325
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200326 /* Only overallocate the buffer if it's not the last write */
327 writer.overallocate = (endpos < size);
328
Victor Stinner01ada392015-10-01 21:54:51 +0200329 switch (error_handler)
330 {
331 case _Py_ERROR_REPLACE:
332 memset(p, '?', endpos - startpos);
333 p += (endpos - startpos);
334 /* fall through the ignore handler */
335 case _Py_ERROR_IGNORE:
336 i += (endpos - startpos - 1);
337 break;
Victor Stinner6099a032011-12-18 14:22:26 +0100338
Victor Stinner01ada392015-10-01 21:54:51 +0200339 case _Py_ERROR_SURROGATEPASS:
340 for (k=startpos; k<endpos; k++) {
341 ch = data[k];
342 *p++ = (char)(0xe0 | (ch >> 12));
343 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
344 *p++ = (char)(0x80 | (ch & 0x3f));
345 }
346 i += (endpos - startpos - 1);
347 break;
348
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200349 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -0700350 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +0200351 writer.min_size -= max_char_size * (endpos - startpos);
352 p = backslashreplace(&writer, p,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200353 unicode, startpos, endpos);
354 if (p == NULL)
355 goto error;
356 i += (endpos - startpos - 1);
357 break;
358
359 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -0700360 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +0200361 writer.min_size -= max_char_size * (endpos - startpos);
362 p = xmlcharrefreplace(&writer, p,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200363 unicode, startpos, endpos);
364 if (p == NULL)
365 goto error;
366 i += (endpos - startpos - 1);
367 break;
368
Victor Stinner01ada392015-10-01 21:54:51 +0200369 case _Py_ERROR_SURROGATEESCAPE:
370 for (k=startpos; k<endpos; k++) {
371 ch = data[k];
372 if (!(0xDC80 <= ch && ch <= 0xDCFF))
373 break;
374 *p++ = (char)(ch & 0xff);
375 }
376 if (k >= endpos) {
377 i += (endpos - startpos - 1);
378 break;
379 }
380 startpos = k;
381 assert(startpos < endpos);
382 /* fall through the default handler */
Victor Stinner01ada392015-10-01 21:54:51 +0200383 default:
384 rep = unicode_encode_call_errorhandler(
385 errors, &error_handler_obj, "utf-8", "surrogates not allowed",
386 unicode, &exc, startpos, endpos, &newpos);
387 if (!rep)
388 goto error;
389
Raymond Hettinger15f44ab2016-08-30 10:47:49 -0700390 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +0200391 writer.min_size -= max_char_size;
392
Victor Stinner01ada392015-10-01 21:54:51 +0200393 if (PyBytes_Check(rep)) {
Victor Stinnerce179bf2015-10-09 12:57:22 +0200394 p = _PyBytesWriter_WriteBytes(&writer, p,
395 PyBytes_AS_STRING(rep),
396 PyBytes_GET_SIZE(rep));
Victor Stinner01ada392015-10-01 21:54:51 +0200397 }
398 else {
399 /* rep is unicode */
400 if (PyUnicode_READY(rep) < 0)
401 goto error;
Victor Stinner6099a032011-12-18 14:22:26 +0100402
Victor Stinner01ada392015-10-01 21:54:51 +0200403 if (!PyUnicode_IS_ASCII(rep)) {
Victor Stinner6099a032011-12-18 14:22:26 +0100404 raise_encode_exception(&exc, "utf-8",
405 unicode,
406 i-1, i,
407 "surrogates not allowed");
408 goto error;
409 }
Victor Stinner01ada392015-10-01 21:54:51 +0200410
411 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Victor Stinner6bd525b2015-10-09 13:10:05 +0200412 p = _PyBytesWriter_WriteBytes(&writer, p,
413 PyUnicode_DATA(rep),
414 PyUnicode_GET_LENGTH(rep));
Victor Stinner6099a032011-12-18 14:22:26 +0100415 }
Victor Stinner6bd525b2015-10-09 13:10:05 +0200416
417 if (p == NULL)
418 goto error;
Victor Stinner01ada392015-10-01 21:54:51 +0200419 Py_CLEAR(rep);
420
421 i = newpos;
Victor Stinner6099a032011-12-18 14:22:26 +0100422 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200423
424 /* If overallocation was disabled, ensure that it was the last
425 write. Otherwise, we missed an optimization */
426 assert(writer.overallocate || i == size);
Victor Stinner6099a032011-12-18 14:22:26 +0100427 }
428 else
429#if STRINGLIB_SIZEOF_CHAR > 2
430 if (ch < 0x10000)
431#endif
432 {
433 *p++ = (char)(0xe0 | (ch >> 12));
434 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
435 *p++ = (char)(0x80 | (ch & 0x3f));
436 }
437#if STRINGLIB_SIZEOF_CHAR > 2
438 else /* ch >= 0x10000 */
439 {
440 assert(ch <= MAX_UNICODE);
441 /* Encode UCS4 Unicode ordinals */
442 *p++ = (char)(0xf0 | (ch >> 18));
443 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
444 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
445 *p++ = (char)(0x80 | (ch & 0x3f));
446 }
447#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
448#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
449 }
450
Victor Stinner6099a032011-12-18 14:22:26 +0100451#if STRINGLIB_SIZEOF_CHAR > 1
Victor Stinner01ada392015-10-01 21:54:51 +0200452 Py_XDECREF(error_handler_obj);
Victor Stinner6099a032011-12-18 14:22:26 +0100453 Py_XDECREF(exc);
454#endif
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200455 return _PyBytesWriter_Finish(&writer, p);
Victor Stinner6099a032011-12-18 14:22:26 +0100456
457#if STRINGLIB_SIZEOF_CHAR > 1
458 error:
459 Py_XDECREF(rep);
Victor Stinner01ada392015-10-01 21:54:51 +0200460 Py_XDECREF(error_handler_obj);
Victor Stinner6099a032011-12-18 14:22:26 +0100461 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200462 _PyBytesWriter_Dealloc(&writer);
Victor Stinner6099a032011-12-18 14:22:26 +0100463 return NULL;
464#endif
465
466#undef MAX_SHORT_UNICHARS
467}
468
Antoine Pitrou63065d72012-05-15 23:48:04 +0200469/* The pattern for constructing UCS2-repeated masks. */
470#if SIZEOF_LONG == 8
471# define UCS2_REPEAT_MASK 0x0001000100010001ul
472#elif SIZEOF_LONG == 4
473# define UCS2_REPEAT_MASK 0x00010001ul
474#else
475# error C 'long' size should be either 4 or 8!
476#endif
477
478/* The mask for fast checking. */
479#if STRINGLIB_SIZEOF_CHAR == 1
480/* The mask for fast checking of whether a C 'long' contains a
481 non-ASCII or non-Latin1 UTF16-encoded characters. */
482# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
483#else
484/* The mask for fast checking of whether a C 'long' may contain
485 UTF16-encoded surrogate characters. This is an efficient heuristic,
486 assuming that non-surrogate characters with a code point >= 0x8000 are
487 rare in most input.
488*/
489# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
490#endif
491/* The mask for fast byte-swapping. */
492#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
493/* Swap bytes. */
494#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
495 (((value) & STRIPPED_MASK) << 8))
496
497Py_LOCAL_INLINE(Py_UCS4)
498STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
499 STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
500 int native_ordering)
501{
502 Py_UCS4 ch;
503 const unsigned char *aligned_end =
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200504 (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
Antoine Pitrou63065d72012-05-15 23:48:04 +0200505 const unsigned char *q = *inptr;
506 STRINGLIB_CHAR *p = dest + *outpos;
507 /* Offsets from q for retrieving byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +0200508#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +0200509 int ihi = !!native_ordering, ilo = !native_ordering;
510#else
511 int ihi = !native_ordering, ilo = !!native_ordering;
512#endif
513 --e;
514
515 while (q < e) {
516 Py_UCS4 ch2;
517 /* First check for possible aligned read of a C 'long'. Unaligned
518 reads are more expensive, better to defer to another iteration. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200519 if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
Antoine Pitrou63065d72012-05-15 23:48:04 +0200520 /* Fast path for runs of in-range non-surrogate chars. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200521 const unsigned char *_q = q;
Antoine Pitrou63065d72012-05-15 23:48:04 +0200522 while (_q < aligned_end) {
523 unsigned long block = * (unsigned long *) _q;
524 if (native_ordering) {
525 /* Can use buffer directly */
526 if (block & FAST_CHAR_MASK)
527 break;
528 }
529 else {
530 /* Need to byte-swap */
531 if (block & SWAB(FAST_CHAR_MASK))
532 break;
533#if STRINGLIB_SIZEOF_CHAR == 1
534 block >>= 8;
535#else
536 block = SWAB(block);
537#endif
538 }
Christian Heimes743e0cd2012-10-17 23:52:17 +0200539#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +0200540# if SIZEOF_LONG == 4
541 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
542 p[1] = (STRINGLIB_CHAR)(block >> 16);
543# elif SIZEOF_LONG == 8
544 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
545 p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
546 p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
547 p[3] = (STRINGLIB_CHAR)(block >> 48);
548# endif
549#else
550# if SIZEOF_LONG == 4
551 p[0] = (STRINGLIB_CHAR)(block >> 16);
552 p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
553# elif SIZEOF_LONG == 8
554 p[0] = (STRINGLIB_CHAR)(block >> 48);
555 p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
556 p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
557 p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
558# endif
559#endif
560 _q += SIZEOF_LONG;
561 p += SIZEOF_LONG / 2;
562 }
563 q = _q;
564 if (q >= e)
565 break;
566 }
567
568 ch = (q[ihi] << 8) | q[ilo];
569 q += 2;
570 if (!Py_UNICODE_IS_SURROGATE(ch)) {
571#if STRINGLIB_SIZEOF_CHAR < 2
572 if (ch > STRINGLIB_MAX_CHAR)
573 /* Out-of-range */
574 goto Return;
575#endif
576 *p++ = (STRINGLIB_CHAR)ch;
577 continue;
578 }
579
580 /* UTF-16 code pair: */
581 if (q >= e)
582 goto UnexpectedEnd;
583 if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
584 goto IllegalEncoding;
585 ch2 = (q[ihi] << 8) | q[ilo];
586 q += 2;
587 if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
588 goto IllegalSurrogate;
589 ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
590#if STRINGLIB_SIZEOF_CHAR < 4
591 /* Out-of-range */
592 goto Return;
593#else
594 *p++ = (STRINGLIB_CHAR)ch;
595#endif
596 }
597 ch = 0;
598Return:
599 *inptr = q;
600 *outpos = p - dest;
601 return ch;
602UnexpectedEnd:
603 ch = 1;
604 goto Return;
605IllegalEncoding:
606 ch = 2;
607 goto Return;
608IllegalSurrogate:
609 ch = 3;
610 goto Return;
611}
612#undef UCS2_REPEAT_MASK
613#undef FAST_CHAR_MASK
614#undef STRIPPED_MASK
615#undef SWAB
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200616
617
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200618#if STRINGLIB_MAX_CHAR >= 0x80
619Py_LOCAL_INLINE(Py_ssize_t)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200620STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
621 Py_ssize_t len,
622 unsigned short **outptr,
623 int native_ordering)
624{
625 unsigned short *out = *outptr;
626 const STRINGLIB_CHAR *end = in + len;
627#if STRINGLIB_SIZEOF_CHAR == 1
628 if (native_ordering) {
629 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
630 while (in < unrolled_end) {
631 out[0] = in[0];
632 out[1] = in[1];
633 out[2] = in[2];
634 out[3] = in[3];
635 in += 4; out += 4;
636 }
637 while (in < end) {
638 *out++ = *in++;
639 }
640 } else {
641# define SWAB2(CH) ((CH) << 8) /* high byte is zero */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200642 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200643 while (in < unrolled_end) {
644 out[0] = SWAB2(in[0]);
645 out[1] = SWAB2(in[1]);
646 out[2] = SWAB2(in[2]);
647 out[3] = SWAB2(in[3]);
648 in += 4; out += 4;
649 }
650 while (in < end) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200651 Py_UCS4 ch = *in++;
652 *out++ = SWAB2((Py_UCS2)ch);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200653 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200654#undef SWAB2
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200655 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200656 *outptr = out;
657 return len;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200658#else
659 if (native_ordering) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200660#if STRINGLIB_MAX_CHAR < 0x10000
661 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
662 while (in < unrolled_end) {
663 /* check if any character is a surrogate character */
664 if (((in[0] ^ 0xd800) &
665 (in[1] ^ 0xd800) &
666 (in[2] ^ 0xd800) &
667 (in[3] ^ 0xd800) & 0xf800) == 0)
668 break;
669 out[0] = in[0];
670 out[1] = in[1];
671 out[2] = in[2];
672 out[3] = in[3];
673 in += 4; out += 4;
674 }
675#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200676 while (in < end) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200677 Py_UCS4 ch;
678 ch = *in++;
679 if (ch < 0xd800)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200680 *out++ = ch;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200681 else if (ch < 0xe000)
Serhiy Storchaka7e29eea2015-05-18 22:19:42 +0300682 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200683 goto fail;
684#if STRINGLIB_MAX_CHAR >= 0x10000
685 else if (ch >= 0x10000) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200686 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
687 out[1] = Py_UNICODE_LOW_SURROGATE(ch);
688 out += 2;
689 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200690#endif
691 else
692 *out++ = ch;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200693 }
694 } else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200695#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
696#if STRINGLIB_MAX_CHAR < 0x10000
697 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
698 while (in < unrolled_end) {
699 /* check if any character is a surrogate character */
700 if (((in[0] ^ 0xd800) &
701 (in[1] ^ 0xd800) &
702 (in[2] ^ 0xd800) &
703 (in[3] ^ 0xd800) & 0xf800) == 0)
704 break;
705 out[0] = SWAB2(in[0]);
706 out[1] = SWAB2(in[1]);
707 out[2] = SWAB2(in[2]);
708 out[3] = SWAB2(in[3]);
709 in += 4; out += 4;
710 }
711#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200712 while (in < end) {
713 Py_UCS4 ch = *in++;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200714 if (ch < 0xd800)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200715 *out++ = SWAB2((Py_UCS2)ch);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200716 else if (ch < 0xe000)
Serhiy Storchaka7e29eea2015-05-18 22:19:42 +0300717 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200718 goto fail;
719#if STRINGLIB_MAX_CHAR >= 0x10000
720 else if (ch >= 0x10000) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200721 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
722 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
723 out[0] = SWAB2(ch1);
724 out[1] = SWAB2(ch2);
725 out += 2;
726 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200727#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200728 else
729 *out++ = SWAB2((Py_UCS2)ch);
730 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200731#undef SWAB2
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200732 }
733 *outptr = out;
734 return len;
735 fail:
736 *outptr = out;
737 return len - (end - in + 1);
738#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200739}
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300740
741#if STRINGLIB_SIZEOF_CHAR == 1
742# define SWAB4(CH, tmp) ((CH) << 24) /* high bytes are zero */
743#elif STRINGLIB_SIZEOF_CHAR == 2
744# define SWAB4(CH, tmp) (tmp = (CH), \
745 ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
746 /* high bytes are zero */
747#else
748# define SWAB4(CH, tmp) (tmp = (CH), \
749 tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
750 ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
751#endif
752Py_LOCAL_INLINE(Py_ssize_t)
753STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
754 Py_ssize_t len,
755 PY_UINT32_T **outptr,
756 int native_ordering)
757{
758 PY_UINT32_T *out = *outptr;
759 const STRINGLIB_CHAR *end = in + len;
760 if (native_ordering) {
761 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
762 while (in < unrolled_end) {
763#if STRINGLIB_SIZEOF_CHAR > 1
764 /* check if any character is a surrogate character */
765 if (((in[0] ^ 0xd800) &
766 (in[1] ^ 0xd800) &
767 (in[2] ^ 0xd800) &
768 (in[3] ^ 0xd800) & 0xf800) == 0)
769 break;
770#endif
771 out[0] = in[0];
772 out[1] = in[1];
773 out[2] = in[2];
774 out[3] = in[3];
775 in += 4; out += 4;
776 }
777 while (in < end) {
778 Py_UCS4 ch;
779 ch = *in++;
780#if STRINGLIB_SIZEOF_CHAR > 1
781 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchaka9ce71a62015-05-18 22:20:18 +0300782 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300783 goto fail;
784 }
785#endif
786 *out++ = ch;
787 }
788 } else {
789 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
790 while (in < unrolled_end) {
791#if STRINGLIB_SIZEOF_CHAR > 1
792 Py_UCS4 ch1, ch2, ch3, ch4;
793 /* check if any character is a surrogate character */
794 if (((in[0] ^ 0xd800) &
795 (in[1] ^ 0xd800) &
796 (in[2] ^ 0xd800) &
797 (in[3] ^ 0xd800) & 0xf800) == 0)
798 break;
799#endif
800 out[0] = SWAB4(in[0], ch1);
801 out[1] = SWAB4(in[1], ch2);
802 out[2] = SWAB4(in[2], ch3);
803 out[3] = SWAB4(in[3], ch4);
804 in += 4; out += 4;
805 }
806 while (in < end) {
807 Py_UCS4 ch = *in++;
808#if STRINGLIB_SIZEOF_CHAR > 1
809 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchaka9ce71a62015-05-18 22:20:18 +0300810 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300811 goto fail;
812 }
813#endif
814 *out++ = SWAB4(ch, ch);
815 }
816 }
817 *outptr = out;
818 return len;
819#if STRINGLIB_SIZEOF_CHAR > 1
820 fail:
821 *outptr = out;
822 return len - (end - in + 1);
823#endif
824}
825#undef SWAB4
826
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200827#endif