blob: a9d0a349d967059463eeb7b81dff79c6c93189ef [file] [log] [blame]
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01001/* stringlib: codec implementations */
2
Serhiy Storchakabcde10a2016-05-16 09:42:29 +03003#if !STRINGLIB_IS_UNICODE
4# error "codecs.h is specific to Unicode"
5#endif
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01006
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01007/* Mask to quickly check whether a C 'long' contains a
8 non-ASCII, UTF8-encoded char. */
9#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +020010# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010011#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +020012# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010013#else
14# error C 'long' size should be either 4 or 8!
15#endif
16
Mark Dickinson106c4142012-06-23 21:45:14 +010017/* 10xxxxxx */
18#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
19
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020020Py_LOCAL_INLINE(Py_UCS4)
21STRINGLIB(utf8_decode)(const char **inptr, const char *end,
22 STRINGLIB_CHAR *dest,
23 Py_ssize_t *outpos)
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010024{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020025 Py_UCS4 ch;
26 const char *s = *inptr;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020027 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020028 STRINGLIB_CHAR *p = dest + *outpos;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010029
30 while (s < end) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020031 ch = (unsigned char)*s;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010032
33 if (ch < 0x80) {
34 /* Fast path for runs of ASCII characters. Given that common UTF-8
35 input will consist of an overwhelming majority of ASCII
36 characters, we try to optimize for this case by checking
37 as many characters as a C 'long' can contain.
38 First, check if we can do an aligned read, as most CPUs have
39 a penalty for unaligned reads.
40 */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020041 if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010042 /* Help register allocation */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020043 const char *_s = s;
44 STRINGLIB_CHAR *_p = p;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010045 while (_s < aligned_end) {
46 /* Read a whole long at a time (either 4 or 8 bytes),
47 and do a fast unrolled copy if it only contains ASCII
48 characters. */
49 unsigned long value = *(unsigned long *) _s;
50 if (value & ASCII_CHAR_MASK)
51 break;
Christian Heimes743e0cd2012-10-17 23:52:17 +020052#if PY_LITTLE_ENDIAN
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020053 _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
54 _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
55 _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
56 _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
57# if SIZEOF_LONG == 8
58 _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
59 _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
60 _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
61 _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
62# endif
63#else
64# if SIZEOF_LONG == 8
65 _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
66 _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
67 _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
68 _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
69 _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
70 _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
71 _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
72 _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
73# else
74 _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
75 _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
76 _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
77 _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
78# endif
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010079#endif
80 _s += SIZEOF_LONG;
81 _p += SIZEOF_LONG;
82 }
83 s = _s;
84 p = _p;
85 if (s == end)
86 break;
87 ch = (unsigned char)*s;
88 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020089 if (ch < 0x80) {
90 s++;
91 *p++ = ch;
92 continue;
93 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010094 }
95
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020096 if (ch < 0xE0) {
97 /* \xC2\x80-\xDF\xBF -- 0080-07FF */
Victor Stinnerab60de42012-11-04 23:59:15 +010098 Py_UCS4 ch2;
Ezio Melottif7ed5d12012-11-04 23:21:38 +020099 if (ch < 0xC2) {
100 /* invalid sequence
101 \x80-\xBF -- continuation byte
102 \xC0-\xC1 -- fake 0000-007F */
103 goto InvalidStart;
104 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200105 if (end - s < 2) {
106 /* unexpected end of data: the caller will decide whether
107 it's an error or not */
108 break;
109 }
110 ch2 = (unsigned char)s[1];
Mark Dickinson106c4142012-06-23 21:45:14 +0100111 if (!IS_CONTINUATION_BYTE(ch2))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200112 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200113 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200114 ch = (ch << 6) + ch2 -
115 ((0xC0 << 6) + 0x80);
116 assert ((ch > 0x007F) && (ch <= 0x07FF));
117 s += 2;
118 if (STRINGLIB_MAX_CHAR <= 0x007F ||
119 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200120 /* Out-of-range */
121 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100122 *p++ = ch;
123 continue;
124 }
125
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200126 if (ch < 0xF0) {
127 /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
128 Py_UCS4 ch2, ch3;
129 if (end - s < 3) {
130 /* unexpected end of data: the caller will decide whether
131 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200132 if (end - s < 2)
133 break;
134 ch2 = (unsigned char)s[1];
135 if (!IS_CONTINUATION_BYTE(ch2) ||
136 (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
137 /* for clarification see comments below */
138 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200139 break;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100140 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200141 ch2 = (unsigned char)s[1];
142 ch3 = (unsigned char)s[2];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200143 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200144 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200145 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200146 }
147 if (ch == 0xE0) {
148 if (ch2 < 0xA0)
149 /* invalid sequence
150 \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200151 goto InvalidContinuation1;
152 } else if (ch == 0xED && ch2 >= 0xA0) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200153 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
154 will result in surrogates in range D800-DFFF. Surrogates are
155 not valid UTF-8 so they are rejected.
156 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
157 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200158 goto InvalidContinuation1;
159 }
160 if (!IS_CONTINUATION_BYTE(ch3)) {
161 /* invalid continuation byte */
162 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200163 }
164 ch = (ch << 12) + (ch2 << 6) + ch3 -
165 ((0xE0 << 12) + (0x80 << 6) + 0x80);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100166 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
167 s += 3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200168 if (STRINGLIB_MAX_CHAR <= 0x07FF ||
169 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200170 /* Out-of-range */
171 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100172 *p++ = ch;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200173 continue;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100174 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200175
176 if (ch < 0xF5) {
177 /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
178 Py_UCS4 ch2, ch3, ch4;
179 if (end - s < 4) {
180 /* unexpected end of data: the caller will decide whether
181 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200182 if (end - s < 2)
183 break;
184 ch2 = (unsigned char)s[1];
185 if (!IS_CONTINUATION_BYTE(ch2) ||
186 (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
187 /* for clarification see comments below */
188 goto InvalidContinuation1;
189 if (end - s < 3)
190 break;
191 ch3 = (unsigned char)s[2];
192 if (!IS_CONTINUATION_BYTE(ch3))
193 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200194 break;
195 }
196 ch2 = (unsigned char)s[1];
197 ch3 = (unsigned char)s[2];
198 ch4 = (unsigned char)s[3];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200199 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200200 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200201 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200202 }
203 if (ch == 0xF0) {
204 if (ch2 < 0x90)
205 /* invalid sequence
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200206 \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
207 goto InvalidContinuation1;
208 } else if (ch == 0xF4 && ch2 >= 0x90) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200209 /* invalid sequence
210 \xF4\x90\x80\80- -- 110000- overflow */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200211 goto InvalidContinuation1;
212 }
213 if (!IS_CONTINUATION_BYTE(ch3)) {
214 /* invalid continuation byte */
215 goto InvalidContinuation2;
216 }
217 if (!IS_CONTINUATION_BYTE(ch4)) {
218 /* invalid continuation byte */
219 goto InvalidContinuation3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200220 }
221 ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
222 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
223 assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
224 s += 4;
225 if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
226 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200227 /* Out-of-range */
228 goto Return;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200229 *p++ = ch;
230 continue;
231 }
232 goto InvalidStart;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100233 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200234 ch = 0;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200235Return:
236 *inptr = s;
237 *outpos = p - dest;
238 return ch;
239InvalidStart:
240 ch = 1;
241 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200242InvalidContinuation1:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200243 ch = 2;
244 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200245InvalidContinuation2:
246 ch = 3;
247 goto Return;
248InvalidContinuation3:
249 ch = 4;
250 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100251}
252
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100253#undef ASCII_CHAR_MASK
254
Victor Stinner6099a032011-12-18 14:22:26 +0100255
256/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
257 PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
258 UCS-1 strings don't need to handle surrogates for example. */
259Py_LOCAL_INLINE(PyObject *)
260STRINGLIB(utf8_encoder)(PyObject *unicode,
261 STRINGLIB_CHAR *data,
262 Py_ssize_t size,
263 const char *errors)
264{
265#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
266
267 Py_ssize_t i; /* index into s of next input byte */
Victor Stinner6099a032011-12-18 14:22:26 +0100268 char *p; /* next free byte in output buffer */
Victor Stinner6099a032011-12-18 14:22:26 +0100269#if STRINGLIB_SIZEOF_CHAR > 1
Victor Stinner01ada392015-10-01 21:54:51 +0200270 PyObject *error_handler_obj = NULL;
Victor Stinner6099a032011-12-18 14:22:26 +0100271 PyObject *exc = NULL;
272 PyObject *rep = NULL;
Victor Stinner01ada392015-10-01 21:54:51 +0200273 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6099a032011-12-18 14:22:26 +0100274#endif
275#if STRINGLIB_SIZEOF_CHAR == 1
276 const Py_ssize_t max_char_size = 2;
Victor Stinner6099a032011-12-18 14:22:26 +0100277#elif STRINGLIB_SIZEOF_CHAR == 2
278 const Py_ssize_t max_char_size = 3;
Victor Stinner6099a032011-12-18 14:22:26 +0100279#else /* STRINGLIB_SIZEOF_CHAR == 4 */
280 const Py_ssize_t max_char_size = 4;
Victor Stinner6099a032011-12-18 14:22:26 +0100281#endif
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200282 _PyBytesWriter writer;
Victor Stinner6099a032011-12-18 14:22:26 +0100283
284 assert(size >= 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200285 _PyBytesWriter_Init(&writer);
Victor Stinner6099a032011-12-18 14:22:26 +0100286
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200287 if (size > PY_SSIZE_T_MAX / max_char_size) {
288 /* integer overflow */
289 return PyErr_NoMemory();
Victor Stinner6099a032011-12-18 14:22:26 +0100290 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200291
292 p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
293 if (p == NULL)
294 return NULL;
Victor Stinner6099a032011-12-18 14:22:26 +0100295
296 for (i = 0; i < size;) {
297 Py_UCS4 ch = data[i++];
298
299 if (ch < 0x80) {
300 /* Encode ASCII */
301 *p++ = (char) ch;
302
303 }
304 else
305#if STRINGLIB_SIZEOF_CHAR > 1
306 if (ch < 0x0800)
307#endif
308 {
309 /* Encode Latin-1 */
310 *p++ = (char)(0xc0 | (ch >> 6));
311 *p++ = (char)(0x80 | (ch & 0x3f));
312 }
313#if STRINGLIB_SIZEOF_CHAR > 1
314 else if (Py_UNICODE_IS_SURROGATE(ch)) {
Victor Stinner01ada392015-10-01 21:54:51 +0200315 Py_ssize_t startpos, endpos, newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +0200316 Py_ssize_t k;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200317 if (error_handler == _Py_ERROR_UNKNOWN) {
Victor Stinner01ada392015-10-01 21:54:51 +0200318 error_handler = get_error_handler(errors);
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200319 }
Victor Stinner01ada392015-10-01 21:54:51 +0200320
Victor Stinner6099a032011-12-18 14:22:26 +0100321 startpos = i-1;
Victor Stinner01ada392015-10-01 21:54:51 +0200322 endpos = startpos+1;
Victor Stinner6099a032011-12-18 14:22:26 +0100323
Victor Stinner01ada392015-10-01 21:54:51 +0200324 while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
325 endpos++;
Victor Stinner6099a032011-12-18 14:22:26 +0100326
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200327 /* Only overallocate the buffer if it's not the last write */
328 writer.overallocate = (endpos < size);
329
Victor Stinner01ada392015-10-01 21:54:51 +0200330 switch (error_handler)
331 {
332 case _Py_ERROR_REPLACE:
333 memset(p, '?', endpos - startpos);
334 p += (endpos - startpos);
335 /* fall through the ignore handler */
336 case _Py_ERROR_IGNORE:
337 i += (endpos - startpos - 1);
338 break;
Victor Stinner6099a032011-12-18 14:22:26 +0100339
Victor Stinner01ada392015-10-01 21:54:51 +0200340 case _Py_ERROR_SURROGATEPASS:
341 for (k=startpos; k<endpos; k++) {
342 ch = data[k];
343 *p++ = (char)(0xe0 | (ch >> 12));
344 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
345 *p++ = (char)(0x80 | (ch & 0x3f));
346 }
347 i += (endpos - startpos - 1);
348 break;
349
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200350 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -0700351 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +0200352 writer.min_size -= max_char_size * (endpos - startpos);
353 p = backslashreplace(&writer, p,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200354 unicode, startpos, endpos);
355 if (p == NULL)
356 goto error;
357 i += (endpos - startpos - 1);
358 break;
359
360 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -0700361 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +0200362 writer.min_size -= max_char_size * (endpos - startpos);
363 p = xmlcharrefreplace(&writer, p,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200364 unicode, startpos, endpos);
365 if (p == NULL)
366 goto error;
367 i += (endpos - startpos - 1);
368 break;
369
Victor Stinner01ada392015-10-01 21:54:51 +0200370 case _Py_ERROR_SURROGATEESCAPE:
371 for (k=startpos; k<endpos; k++) {
372 ch = data[k];
373 if (!(0xDC80 <= ch && ch <= 0xDCFF))
374 break;
375 *p++ = (char)(ch & 0xff);
376 }
377 if (k >= endpos) {
378 i += (endpos - startpos - 1);
379 break;
380 }
381 startpos = k;
382 assert(startpos < endpos);
383 /* fall through the default handler */
Victor Stinner01ada392015-10-01 21:54:51 +0200384 default:
385 rep = unicode_encode_call_errorhandler(
386 errors, &error_handler_obj, "utf-8", "surrogates not allowed",
387 unicode, &exc, startpos, endpos, &newpos);
388 if (!rep)
389 goto error;
390
Raymond Hettinger15f44ab2016-08-30 10:47:49 -0700391 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +0200392 writer.min_size -= max_char_size;
393
Victor Stinner01ada392015-10-01 21:54:51 +0200394 if (PyBytes_Check(rep)) {
Victor Stinnerce179bf2015-10-09 12:57:22 +0200395 p = _PyBytesWriter_WriteBytes(&writer, p,
396 PyBytes_AS_STRING(rep),
397 PyBytes_GET_SIZE(rep));
Victor Stinner01ada392015-10-01 21:54:51 +0200398 }
399 else {
400 /* rep is unicode */
401 if (PyUnicode_READY(rep) < 0)
402 goto error;
Victor Stinner6099a032011-12-18 14:22:26 +0100403
Victor Stinner01ada392015-10-01 21:54:51 +0200404 if (!PyUnicode_IS_ASCII(rep)) {
Victor Stinner6099a032011-12-18 14:22:26 +0100405 raise_encode_exception(&exc, "utf-8",
406 unicode,
407 i-1, i,
408 "surrogates not allowed");
409 goto error;
410 }
Victor Stinner01ada392015-10-01 21:54:51 +0200411
412 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Victor Stinner6bd525b2015-10-09 13:10:05 +0200413 p = _PyBytesWriter_WriteBytes(&writer, p,
414 PyUnicode_DATA(rep),
415 PyUnicode_GET_LENGTH(rep));
Victor Stinner6099a032011-12-18 14:22:26 +0100416 }
Victor Stinner6bd525b2015-10-09 13:10:05 +0200417
418 if (p == NULL)
419 goto error;
Victor Stinner01ada392015-10-01 21:54:51 +0200420 Py_CLEAR(rep);
421
422 i = newpos;
Victor Stinner6099a032011-12-18 14:22:26 +0100423 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200424
425 /* If overallocation was disabled, ensure that it was the last
426 write. Otherwise, we missed an optimization */
427 assert(writer.overallocate || i == size);
Victor Stinner6099a032011-12-18 14:22:26 +0100428 }
429 else
430#if STRINGLIB_SIZEOF_CHAR > 2
431 if (ch < 0x10000)
432#endif
433 {
434 *p++ = (char)(0xe0 | (ch >> 12));
435 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
436 *p++ = (char)(0x80 | (ch & 0x3f));
437 }
438#if STRINGLIB_SIZEOF_CHAR > 2
439 else /* ch >= 0x10000 */
440 {
441 assert(ch <= MAX_UNICODE);
442 /* Encode UCS4 Unicode ordinals */
443 *p++ = (char)(0xf0 | (ch >> 18));
444 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
445 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
446 *p++ = (char)(0x80 | (ch & 0x3f));
447 }
448#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
449#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
450 }
451
Victor Stinner6099a032011-12-18 14:22:26 +0100452#if STRINGLIB_SIZEOF_CHAR > 1
Victor Stinner01ada392015-10-01 21:54:51 +0200453 Py_XDECREF(error_handler_obj);
Victor Stinner6099a032011-12-18 14:22:26 +0100454 Py_XDECREF(exc);
455#endif
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200456 return _PyBytesWriter_Finish(&writer, p);
Victor Stinner6099a032011-12-18 14:22:26 +0100457
458#if STRINGLIB_SIZEOF_CHAR > 1
459 error:
460 Py_XDECREF(rep);
Victor Stinner01ada392015-10-01 21:54:51 +0200461 Py_XDECREF(error_handler_obj);
Victor Stinner6099a032011-12-18 14:22:26 +0100462 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200463 _PyBytesWriter_Dealloc(&writer);
Victor Stinner6099a032011-12-18 14:22:26 +0100464 return NULL;
465#endif
466
467#undef MAX_SHORT_UNICHARS
468}
469
Antoine Pitrou63065d72012-05-15 23:48:04 +0200470/* The pattern for constructing UCS2-repeated masks. */
471#if SIZEOF_LONG == 8
472# define UCS2_REPEAT_MASK 0x0001000100010001ul
473#elif SIZEOF_LONG == 4
474# define UCS2_REPEAT_MASK 0x00010001ul
475#else
476# error C 'long' size should be either 4 or 8!
477#endif
478
479/* The mask for fast checking. */
480#if STRINGLIB_SIZEOF_CHAR == 1
481/* The mask for fast checking of whether a C 'long' contains a
482 non-ASCII or non-Latin1 UTF16-encoded characters. */
483# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
484#else
485/* The mask for fast checking of whether a C 'long' may contain
486 UTF16-encoded surrogate characters. This is an efficient heuristic,
487 assuming that non-surrogate characters with a code point >= 0x8000 are
488 rare in most input.
489*/
490# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
491#endif
492/* The mask for fast byte-swapping. */
493#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
494/* Swap bytes. */
495#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
496 (((value) & STRIPPED_MASK) << 8))
497
498Py_LOCAL_INLINE(Py_UCS4)
499STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
500 STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
501 int native_ordering)
502{
503 Py_UCS4 ch;
504 const unsigned char *aligned_end =
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200505 (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
Antoine Pitrou63065d72012-05-15 23:48:04 +0200506 const unsigned char *q = *inptr;
507 STRINGLIB_CHAR *p = dest + *outpos;
508 /* Offsets from q for retrieving byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +0200509#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +0200510 int ihi = !!native_ordering, ilo = !native_ordering;
511#else
512 int ihi = !native_ordering, ilo = !!native_ordering;
513#endif
514 --e;
515
516 while (q < e) {
517 Py_UCS4 ch2;
518 /* First check for possible aligned read of a C 'long'. Unaligned
519 reads are more expensive, better to defer to another iteration. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200520 if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
Antoine Pitrou63065d72012-05-15 23:48:04 +0200521 /* Fast path for runs of in-range non-surrogate chars. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200522 const unsigned char *_q = q;
Antoine Pitrou63065d72012-05-15 23:48:04 +0200523 while (_q < aligned_end) {
524 unsigned long block = * (unsigned long *) _q;
525 if (native_ordering) {
526 /* Can use buffer directly */
527 if (block & FAST_CHAR_MASK)
528 break;
529 }
530 else {
531 /* Need to byte-swap */
532 if (block & SWAB(FAST_CHAR_MASK))
533 break;
534#if STRINGLIB_SIZEOF_CHAR == 1
535 block >>= 8;
536#else
537 block = SWAB(block);
538#endif
539 }
Christian Heimes743e0cd2012-10-17 23:52:17 +0200540#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +0200541# if SIZEOF_LONG == 4
542 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
543 p[1] = (STRINGLIB_CHAR)(block >> 16);
544# elif SIZEOF_LONG == 8
545 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
546 p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
547 p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
548 p[3] = (STRINGLIB_CHAR)(block >> 48);
549# endif
550#else
551# if SIZEOF_LONG == 4
552 p[0] = (STRINGLIB_CHAR)(block >> 16);
553 p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
554# elif SIZEOF_LONG == 8
555 p[0] = (STRINGLIB_CHAR)(block >> 48);
556 p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
557 p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
558 p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
559# endif
560#endif
561 _q += SIZEOF_LONG;
562 p += SIZEOF_LONG / 2;
563 }
564 q = _q;
565 if (q >= e)
566 break;
567 }
568
569 ch = (q[ihi] << 8) | q[ilo];
570 q += 2;
571 if (!Py_UNICODE_IS_SURROGATE(ch)) {
572#if STRINGLIB_SIZEOF_CHAR < 2
573 if (ch > STRINGLIB_MAX_CHAR)
574 /* Out-of-range */
575 goto Return;
576#endif
577 *p++ = (STRINGLIB_CHAR)ch;
578 continue;
579 }
580
581 /* UTF-16 code pair: */
582 if (q >= e)
583 goto UnexpectedEnd;
584 if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
585 goto IllegalEncoding;
586 ch2 = (q[ihi] << 8) | q[ilo];
587 q += 2;
588 if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
589 goto IllegalSurrogate;
590 ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
591#if STRINGLIB_SIZEOF_CHAR < 4
592 /* Out-of-range */
593 goto Return;
594#else
595 *p++ = (STRINGLIB_CHAR)ch;
596#endif
597 }
598 ch = 0;
599Return:
600 *inptr = q;
601 *outpos = p - dest;
602 return ch;
603UnexpectedEnd:
604 ch = 1;
605 goto Return;
606IllegalEncoding:
607 ch = 2;
608 goto Return;
609IllegalSurrogate:
610 ch = 3;
611 goto Return;
612}
613#undef UCS2_REPEAT_MASK
614#undef FAST_CHAR_MASK
615#undef STRIPPED_MASK
616#undef SWAB
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200617
618
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200619#if STRINGLIB_MAX_CHAR >= 0x80
620Py_LOCAL_INLINE(Py_ssize_t)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200621STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
622 Py_ssize_t len,
623 unsigned short **outptr,
624 int native_ordering)
625{
626 unsigned short *out = *outptr;
627 const STRINGLIB_CHAR *end = in + len;
628#if STRINGLIB_SIZEOF_CHAR == 1
629 if (native_ordering) {
630 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
631 while (in < unrolled_end) {
632 out[0] = in[0];
633 out[1] = in[1];
634 out[2] = in[2];
635 out[3] = in[3];
636 in += 4; out += 4;
637 }
638 while (in < end) {
639 *out++ = *in++;
640 }
641 } else {
642# define SWAB2(CH) ((CH) << 8) /* high byte is zero */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200643 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200644 while (in < unrolled_end) {
645 out[0] = SWAB2(in[0]);
646 out[1] = SWAB2(in[1]);
647 out[2] = SWAB2(in[2]);
648 out[3] = SWAB2(in[3]);
649 in += 4; out += 4;
650 }
651 while (in < end) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200652 Py_UCS4 ch = *in++;
653 *out++ = SWAB2((Py_UCS2)ch);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200654 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200655#undef SWAB2
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200656 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200657 *outptr = out;
658 return len;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200659#else
660 if (native_ordering) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200661#if STRINGLIB_MAX_CHAR < 0x10000
662 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
663 while (in < unrolled_end) {
664 /* check if any character is a surrogate character */
665 if (((in[0] ^ 0xd800) &
666 (in[1] ^ 0xd800) &
667 (in[2] ^ 0xd800) &
668 (in[3] ^ 0xd800) & 0xf800) == 0)
669 break;
670 out[0] = in[0];
671 out[1] = in[1];
672 out[2] = in[2];
673 out[3] = in[3];
674 in += 4; out += 4;
675 }
676#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200677 while (in < end) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200678 Py_UCS4 ch;
679 ch = *in++;
680 if (ch < 0xd800)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200681 *out++ = ch;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200682 else if (ch < 0xe000)
Serhiy Storchaka7e29eea2015-05-18 22:19:42 +0300683 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200684 goto fail;
685#if STRINGLIB_MAX_CHAR >= 0x10000
686 else if (ch >= 0x10000) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200687 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
688 out[1] = Py_UNICODE_LOW_SURROGATE(ch);
689 out += 2;
690 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200691#endif
692 else
693 *out++ = ch;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200694 }
695 } else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200696#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
697#if STRINGLIB_MAX_CHAR < 0x10000
698 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
699 while (in < unrolled_end) {
700 /* check if any character is a surrogate character */
701 if (((in[0] ^ 0xd800) &
702 (in[1] ^ 0xd800) &
703 (in[2] ^ 0xd800) &
704 (in[3] ^ 0xd800) & 0xf800) == 0)
705 break;
706 out[0] = SWAB2(in[0]);
707 out[1] = SWAB2(in[1]);
708 out[2] = SWAB2(in[2]);
709 out[3] = SWAB2(in[3]);
710 in += 4; out += 4;
711 }
712#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200713 while (in < end) {
714 Py_UCS4 ch = *in++;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200715 if (ch < 0xd800)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200716 *out++ = SWAB2((Py_UCS2)ch);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200717 else if (ch < 0xe000)
Serhiy Storchaka7e29eea2015-05-18 22:19:42 +0300718 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200719 goto fail;
720#if STRINGLIB_MAX_CHAR >= 0x10000
721 else if (ch >= 0x10000) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200722 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
723 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
724 out[0] = SWAB2(ch1);
725 out[1] = SWAB2(ch2);
726 out += 2;
727 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200728#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200729 else
730 *out++ = SWAB2((Py_UCS2)ch);
731 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200732#undef SWAB2
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200733 }
734 *outptr = out;
735 return len;
736 fail:
737 *outptr = out;
738 return len - (end - in + 1);
739#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200740}
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300741
/* SWAB4(CH, tmp): the character CH as a byte-swapped 32-bit value.
   `tmp` is a scratch lvalue; the one-byte variant does not use it. */
#if STRINGLIB_SIZEOF_CHAR == 1
/* Convert before shifting: a one-byte operand is promoted to (signed)
   int, and shifting a value >= 0x80 left by 24 would overflow it. */
# define SWAB4(CH, tmp)  ((PY_UINT32_T)(CH) << 24) /* high bytes are zero */
#elif STRINGLIB_SIZEOF_CHAR == 2
# define SWAB4(CH, tmp)  (tmp = (CH), \
            ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
            /* high bytes are zero */
#else
# define SWAB4(CH, tmp)  (tmp = (CH), \
            tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
            ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
#endif
753Py_LOCAL_INLINE(Py_ssize_t)
754STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
755 Py_ssize_t len,
756 PY_UINT32_T **outptr,
757 int native_ordering)
758{
759 PY_UINT32_T *out = *outptr;
760 const STRINGLIB_CHAR *end = in + len;
761 if (native_ordering) {
762 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
763 while (in < unrolled_end) {
764#if STRINGLIB_SIZEOF_CHAR > 1
765 /* check if any character is a surrogate character */
766 if (((in[0] ^ 0xd800) &
767 (in[1] ^ 0xd800) &
768 (in[2] ^ 0xd800) &
769 (in[3] ^ 0xd800) & 0xf800) == 0)
770 break;
771#endif
772 out[0] = in[0];
773 out[1] = in[1];
774 out[2] = in[2];
775 out[3] = in[3];
776 in += 4; out += 4;
777 }
778 while (in < end) {
779 Py_UCS4 ch;
780 ch = *in++;
781#if STRINGLIB_SIZEOF_CHAR > 1
782 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchaka9ce71a62015-05-18 22:20:18 +0300783 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300784 goto fail;
785 }
786#endif
787 *out++ = ch;
788 }
789 } else {
790 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
791 while (in < unrolled_end) {
792#if STRINGLIB_SIZEOF_CHAR > 1
793 Py_UCS4 ch1, ch2, ch3, ch4;
794 /* check if any character is a surrogate character */
795 if (((in[0] ^ 0xd800) &
796 (in[1] ^ 0xd800) &
797 (in[2] ^ 0xd800) &
798 (in[3] ^ 0xd800) & 0xf800) == 0)
799 break;
800#endif
801 out[0] = SWAB4(in[0], ch1);
802 out[1] = SWAB4(in[1], ch2);
803 out[2] = SWAB4(in[2], ch3);
804 out[3] = SWAB4(in[3], ch4);
805 in += 4; out += 4;
806 }
807 while (in < end) {
808 Py_UCS4 ch = *in++;
809#if STRINGLIB_SIZEOF_CHAR > 1
810 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchaka9ce71a62015-05-18 22:20:18 +0300811 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300812 goto fail;
813 }
814#endif
815 *out++ = SWAB4(ch, ch);
816 }
817 }
818 *outptr = out;
819 return len;
820#if STRINGLIB_SIZEOF_CHAR > 1
821 fail:
822 *outptr = out;
823 return len - (end - in + 1);
824#endif
825}
826#undef SWAB4
827
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200828#endif