/* Extracted from a source-browser blame view; blob 269a5581f70055261bbdf753a85a33fa4a9e85c4 */
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01001/* stringlib: codec implementations */
2
Serhiy Storchakabcde10a2016-05-16 09:42:29 +03003#if !STRINGLIB_IS_UNICODE
4# error "codecs.h is specific to Unicode"
5#endif
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01006
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01007/* Mask to quickly check whether a C 'long' contains a
8 non-ASCII, UTF8-encoded char. */
9#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +020010# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010011#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +020012# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010013#else
14# error C 'long' size should be either 4 or 8!
15#endif
16
Mark Dickinson106c4142012-06-23 21:45:14 +010017/* 10xxxxxx */
18#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
19
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020020Py_LOCAL_INLINE(Py_UCS4)
21STRINGLIB(utf8_decode)(const char **inptr, const char *end,
22 STRINGLIB_CHAR *dest,
23 Py_ssize_t *outpos)
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010024{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020025 Py_UCS4 ch;
26 const char *s = *inptr;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020027 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020028 STRINGLIB_CHAR *p = dest + *outpos;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010029
30 while (s < end) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020031 ch = (unsigned char)*s;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010032
33 if (ch < 0x80) {
34 /* Fast path for runs of ASCII characters. Given that common UTF-8
35 input will consist of an overwhelming majority of ASCII
36 characters, we try to optimize for this case by checking
37 as many characters as a C 'long' can contain.
38 First, check if we can do an aligned read, as most CPUs have
39 a penalty for unaligned reads.
40 */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020041 if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010042 /* Help register allocation */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020043 const char *_s = s;
44 STRINGLIB_CHAR *_p = p;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010045 while (_s < aligned_end) {
46 /* Read a whole long at a time (either 4 or 8 bytes),
47 and do a fast unrolled copy if it only contains ASCII
48 characters. */
Andy Lestere6be9b52020-02-11 20:28:35 -060049 unsigned long value = *(const unsigned long *) _s;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010050 if (value & ASCII_CHAR_MASK)
51 break;
Christian Heimes743e0cd2012-10-17 23:52:17 +020052#if PY_LITTLE_ENDIAN
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020053 _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
54 _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
55 _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
56 _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
57# if SIZEOF_LONG == 8
58 _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
59 _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
60 _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
61 _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
62# endif
63#else
64# if SIZEOF_LONG == 8
65 _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
66 _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
67 _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
68 _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
69 _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
70 _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
71 _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
72 _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
73# else
74 _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
75 _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
76 _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
77 _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
78# endif
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010079#endif
80 _s += SIZEOF_LONG;
81 _p += SIZEOF_LONG;
82 }
83 s = _s;
84 p = _p;
85 if (s == end)
86 break;
87 ch = (unsigned char)*s;
88 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020089 if (ch < 0x80) {
90 s++;
91 *p++ = ch;
92 continue;
93 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010094 }
95
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020096 if (ch < 0xE0) {
97 /* \xC2\x80-\xDF\xBF -- 0080-07FF */
Victor Stinnerab60de42012-11-04 23:59:15 +010098 Py_UCS4 ch2;
Ezio Melottif7ed5d12012-11-04 23:21:38 +020099 if (ch < 0xC2) {
100 /* invalid sequence
101 \x80-\xBF -- continuation byte
102 \xC0-\xC1 -- fake 0000-007F */
103 goto InvalidStart;
104 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200105 if (end - s < 2) {
106 /* unexpected end of data: the caller will decide whether
107 it's an error or not */
108 break;
109 }
110 ch2 = (unsigned char)s[1];
Mark Dickinson106c4142012-06-23 21:45:14 +0100111 if (!IS_CONTINUATION_BYTE(ch2))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200112 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200113 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200114 ch = (ch << 6) + ch2 -
115 ((0xC0 << 6) + 0x80);
116 assert ((ch > 0x007F) && (ch <= 0x07FF));
117 s += 2;
118 if (STRINGLIB_MAX_CHAR <= 0x007F ||
119 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200120 /* Out-of-range */
121 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100122 *p++ = ch;
123 continue;
124 }
125
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200126 if (ch < 0xF0) {
127 /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
128 Py_UCS4 ch2, ch3;
129 if (end - s < 3) {
130 /* unexpected end of data: the caller will decide whether
131 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200132 if (end - s < 2)
133 break;
134 ch2 = (unsigned char)s[1];
135 if (!IS_CONTINUATION_BYTE(ch2) ||
136 (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
137 /* for clarification see comments below */
138 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200139 break;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100140 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200141 ch2 = (unsigned char)s[1];
142 ch3 = (unsigned char)s[2];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200143 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200144 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200145 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200146 }
147 if (ch == 0xE0) {
148 if (ch2 < 0xA0)
149 /* invalid sequence
150 \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200151 goto InvalidContinuation1;
152 } else if (ch == 0xED && ch2 >= 0xA0) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200153 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
154 will result in surrogates in range D800-DFFF. Surrogates are
155 not valid UTF-8 so they are rejected.
156 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
157 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200158 goto InvalidContinuation1;
159 }
160 if (!IS_CONTINUATION_BYTE(ch3)) {
161 /* invalid continuation byte */
162 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200163 }
164 ch = (ch << 12) + (ch2 << 6) + ch3 -
165 ((0xE0 << 12) + (0x80 << 6) + 0x80);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100166 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
167 s += 3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200168 if (STRINGLIB_MAX_CHAR <= 0x07FF ||
169 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200170 /* Out-of-range */
171 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100172 *p++ = ch;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200173 continue;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100174 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200175
176 if (ch < 0xF5) {
177 /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
178 Py_UCS4 ch2, ch3, ch4;
179 if (end - s < 4) {
180 /* unexpected end of data: the caller will decide whether
181 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200182 if (end - s < 2)
183 break;
184 ch2 = (unsigned char)s[1];
185 if (!IS_CONTINUATION_BYTE(ch2) ||
186 (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
187 /* for clarification see comments below */
188 goto InvalidContinuation1;
189 if (end - s < 3)
190 break;
191 ch3 = (unsigned char)s[2];
192 if (!IS_CONTINUATION_BYTE(ch3))
193 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200194 break;
195 }
196 ch2 = (unsigned char)s[1];
197 ch3 = (unsigned char)s[2];
198 ch4 = (unsigned char)s[3];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200199 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200200 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200201 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200202 }
203 if (ch == 0xF0) {
204 if (ch2 < 0x90)
205 /* invalid sequence
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200206 \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
207 goto InvalidContinuation1;
208 } else if (ch == 0xF4 && ch2 >= 0x90) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200209 /* invalid sequence
Serhiy Storchaka894263b2019-06-25 11:54:18 +0300210 \xF4\x90\x80\x80- -- 110000- overflow */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200211 goto InvalidContinuation1;
212 }
213 if (!IS_CONTINUATION_BYTE(ch3)) {
214 /* invalid continuation byte */
215 goto InvalidContinuation2;
216 }
217 if (!IS_CONTINUATION_BYTE(ch4)) {
218 /* invalid continuation byte */
219 goto InvalidContinuation3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200220 }
221 ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
222 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
223 assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
224 s += 4;
225 if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
226 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200227 /* Out-of-range */
228 goto Return;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200229 *p++ = ch;
230 continue;
231 }
232 goto InvalidStart;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100233 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200234 ch = 0;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200235Return:
236 *inptr = s;
237 *outpos = p - dest;
238 return ch;
239InvalidStart:
240 ch = 1;
241 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200242InvalidContinuation1:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200243 ch = 2;
244 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200245InvalidContinuation2:
246 ch = 3;
247 goto Return;
248InvalidContinuation3:
249 ch = 4;
250 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100251}
252
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100253#undef ASCII_CHAR_MASK
254
Victor Stinner6099a032011-12-18 14:22:26 +0100255
256/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
257 PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
258 UCS-1 strings don't need to handle surrogates for example. */
259Py_LOCAL_INLINE(PyObject *)
260STRINGLIB(utf8_encoder)(PyObject *unicode,
261 STRINGLIB_CHAR *data,
262 Py_ssize_t size,
Victor Stinner709d23d2019-05-02 14:56:30 -0400263 _Py_error_handler error_handler,
Victor Stinner6099a032011-12-18 14:22:26 +0100264 const char *errors)
265{
Serhiy Storchaka998c9cd2016-10-30 18:25:27 +0200266 Py_ssize_t i; /* index into data of next input character */
Victor Stinner6099a032011-12-18 14:22:26 +0100267 char *p; /* next free byte in output buffer */
Victor Stinner6099a032011-12-18 14:22:26 +0100268#if STRINGLIB_SIZEOF_CHAR > 1
Victor Stinner01ada392015-10-01 21:54:51 +0200269 PyObject *error_handler_obj = NULL;
Victor Stinner6099a032011-12-18 14:22:26 +0100270 PyObject *exc = NULL;
271 PyObject *rep = NULL;
272#endif
273#if STRINGLIB_SIZEOF_CHAR == 1
274 const Py_ssize_t max_char_size = 2;
Victor Stinner6099a032011-12-18 14:22:26 +0100275#elif STRINGLIB_SIZEOF_CHAR == 2
276 const Py_ssize_t max_char_size = 3;
Victor Stinner6099a032011-12-18 14:22:26 +0100277#else /* STRINGLIB_SIZEOF_CHAR == 4 */
278 const Py_ssize_t max_char_size = 4;
Victor Stinner6099a032011-12-18 14:22:26 +0100279#endif
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200280 _PyBytesWriter writer;
Victor Stinner6099a032011-12-18 14:22:26 +0100281
282 assert(size >= 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200283 _PyBytesWriter_Init(&writer);
Victor Stinner6099a032011-12-18 14:22:26 +0100284
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200285 if (size > PY_SSIZE_T_MAX / max_char_size) {
286 /* integer overflow */
287 return PyErr_NoMemory();
Victor Stinner6099a032011-12-18 14:22:26 +0100288 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200289
290 p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
291 if (p == NULL)
292 return NULL;
Victor Stinner6099a032011-12-18 14:22:26 +0100293
294 for (i = 0; i < size;) {
295 Py_UCS4 ch = data[i++];
296
297 if (ch < 0x80) {
298 /* Encode ASCII */
299 *p++ = (char) ch;
300
301 }
302 else
303#if STRINGLIB_SIZEOF_CHAR > 1
304 if (ch < 0x0800)
305#endif
306 {
307 /* Encode Latin-1 */
308 *p++ = (char)(0xc0 | (ch >> 6));
309 *p++ = (char)(0x80 | (ch & 0x3f));
310 }
311#if STRINGLIB_SIZEOF_CHAR > 1
312 else if (Py_UNICODE_IS_SURROGATE(ch)) {
Victor Stinner01ada392015-10-01 21:54:51 +0200313 Py_ssize_t startpos, endpos, newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +0200314 Py_ssize_t k;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200315 if (error_handler == _Py_ERROR_UNKNOWN) {
Victor Stinner3d4226a2018-08-29 22:21:32 +0200316 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200317 }
Victor Stinner01ada392015-10-01 21:54:51 +0200318
Victor Stinner6099a032011-12-18 14:22:26 +0100319 startpos = i-1;
Victor Stinner01ada392015-10-01 21:54:51 +0200320 endpos = startpos+1;
Victor Stinner6099a032011-12-18 14:22:26 +0100321
Victor Stinner01ada392015-10-01 21:54:51 +0200322 while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
323 endpos++;
Victor Stinner6099a032011-12-18 14:22:26 +0100324
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200325 /* Only overallocate the buffer if it's not the last write */
326 writer.overallocate = (endpos < size);
327
Victor Stinner01ada392015-10-01 21:54:51 +0200328 switch (error_handler)
329 {
330 case _Py_ERROR_REPLACE:
331 memset(p, '?', endpos - startpos);
332 p += (endpos - startpos);
Stefan Krahf432a322017-08-21 13:09:59 +0200333 /* fall through */
Victor Stinner01ada392015-10-01 21:54:51 +0200334 case _Py_ERROR_IGNORE:
335 i += (endpos - startpos - 1);
336 break;
Victor Stinner6099a032011-12-18 14:22:26 +0100337
Victor Stinner01ada392015-10-01 21:54:51 +0200338 case _Py_ERROR_SURROGATEPASS:
339 for (k=startpos; k<endpos; k++) {
340 ch = data[k];
341 *p++ = (char)(0xe0 | (ch >> 12));
342 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
343 *p++ = (char)(0x80 | (ch & 0x3f));
344 }
345 i += (endpos - startpos - 1);
346 break;
347
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200348 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -0700349 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +0200350 writer.min_size -= max_char_size * (endpos - startpos);
351 p = backslashreplace(&writer, p,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200352 unicode, startpos, endpos);
353 if (p == NULL)
354 goto error;
355 i += (endpos - startpos - 1);
356 break;
357
358 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -0700359 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +0200360 writer.min_size -= max_char_size * (endpos - startpos);
361 p = xmlcharrefreplace(&writer, p,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200362 unicode, startpos, endpos);
363 if (p == NULL)
364 goto error;
365 i += (endpos - startpos - 1);
366 break;
367
Victor Stinner01ada392015-10-01 21:54:51 +0200368 case _Py_ERROR_SURROGATEESCAPE:
369 for (k=startpos; k<endpos; k++) {
370 ch = data[k];
371 if (!(0xDC80 <= ch && ch <= 0xDCFF))
372 break;
373 *p++ = (char)(ch & 0xff);
374 }
375 if (k >= endpos) {
376 i += (endpos - startpos - 1);
377 break;
378 }
379 startpos = k;
380 assert(startpos < endpos);
Stefan Krahf432a322017-08-21 13:09:59 +0200381 /* fall through */
Victor Stinner01ada392015-10-01 21:54:51 +0200382 default:
383 rep = unicode_encode_call_errorhandler(
384 errors, &error_handler_obj, "utf-8", "surrogates not allowed",
385 unicode, &exc, startpos, endpos, &newpos);
386 if (!rep)
387 goto error;
388
Raymond Hettinger15f44ab2016-08-30 10:47:49 -0700389 /* subtract preallocated bytes */
Serhiy Storchaka998c9cd2016-10-30 18:25:27 +0200390 writer.min_size -= max_char_size * (newpos - startpos);
Victor Stinnerad771582015-10-09 12:38:53 +0200391
Victor Stinner01ada392015-10-01 21:54:51 +0200392 if (PyBytes_Check(rep)) {
Victor Stinnerce179bf2015-10-09 12:57:22 +0200393 p = _PyBytesWriter_WriteBytes(&writer, p,
394 PyBytes_AS_STRING(rep),
395 PyBytes_GET_SIZE(rep));
Victor Stinner01ada392015-10-01 21:54:51 +0200396 }
397 else {
398 /* rep is unicode */
399 if (PyUnicode_READY(rep) < 0)
400 goto error;
Victor Stinner6099a032011-12-18 14:22:26 +0100401
Victor Stinner01ada392015-10-01 21:54:51 +0200402 if (!PyUnicode_IS_ASCII(rep)) {
Serhiy Storchaka998c9cd2016-10-30 18:25:27 +0200403 raise_encode_exception(&exc, "utf-8", unicode,
404 startpos, endpos,
Victor Stinner6099a032011-12-18 14:22:26 +0100405 "surrogates not allowed");
406 goto error;
407 }
Victor Stinner01ada392015-10-01 21:54:51 +0200408
Victor Stinner6bd525b2015-10-09 13:10:05 +0200409 p = _PyBytesWriter_WriteBytes(&writer, p,
410 PyUnicode_DATA(rep),
411 PyUnicode_GET_LENGTH(rep));
Victor Stinner6099a032011-12-18 14:22:26 +0100412 }
Victor Stinner6bd525b2015-10-09 13:10:05 +0200413
414 if (p == NULL)
415 goto error;
Victor Stinner01ada392015-10-01 21:54:51 +0200416 Py_CLEAR(rep);
417
418 i = newpos;
Victor Stinner6099a032011-12-18 14:22:26 +0100419 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200420
421 /* If overallocation was disabled, ensure that it was the last
422 write. Otherwise, we missed an optimization */
423 assert(writer.overallocate || i == size);
Victor Stinner6099a032011-12-18 14:22:26 +0100424 }
425 else
426#if STRINGLIB_SIZEOF_CHAR > 2
427 if (ch < 0x10000)
428#endif
429 {
430 *p++ = (char)(0xe0 | (ch >> 12));
431 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
432 *p++ = (char)(0x80 | (ch & 0x3f));
433 }
434#if STRINGLIB_SIZEOF_CHAR > 2
435 else /* ch >= 0x10000 */
436 {
437 assert(ch <= MAX_UNICODE);
438 /* Encode UCS4 Unicode ordinals */
439 *p++ = (char)(0xf0 | (ch >> 18));
440 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
441 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
442 *p++ = (char)(0x80 | (ch & 0x3f));
443 }
444#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
445#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
446 }
447
Victor Stinner6099a032011-12-18 14:22:26 +0100448#if STRINGLIB_SIZEOF_CHAR > 1
Victor Stinner01ada392015-10-01 21:54:51 +0200449 Py_XDECREF(error_handler_obj);
Victor Stinner6099a032011-12-18 14:22:26 +0100450 Py_XDECREF(exc);
451#endif
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200452 return _PyBytesWriter_Finish(&writer, p);
Victor Stinner6099a032011-12-18 14:22:26 +0100453
454#if STRINGLIB_SIZEOF_CHAR > 1
455 error:
456 Py_XDECREF(rep);
Victor Stinner01ada392015-10-01 21:54:51 +0200457 Py_XDECREF(error_handler_obj);
Victor Stinner6099a032011-12-18 14:22:26 +0100458 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200459 _PyBytesWriter_Dealloc(&writer);
Victor Stinner6099a032011-12-18 14:22:26 +0100460 return NULL;
461#endif
Victor Stinner6099a032011-12-18 14:22:26 +0100462}
463
Antoine Pitrou63065d72012-05-15 23:48:04 +0200464/* The pattern for constructing UCS2-repeated masks. */
465#if SIZEOF_LONG == 8
466# define UCS2_REPEAT_MASK 0x0001000100010001ul
467#elif SIZEOF_LONG == 4
468# define UCS2_REPEAT_MASK 0x00010001ul
469#else
470# error C 'long' size should be either 4 or 8!
471#endif
472
473/* The mask for fast checking. */
474#if STRINGLIB_SIZEOF_CHAR == 1
475/* The mask for fast checking of whether a C 'long' contains a
476 non-ASCII or non-Latin1 UTF16-encoded characters. */
477# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
478#else
479/* The mask for fast checking of whether a C 'long' may contain
480 UTF16-encoded surrogate characters. This is an efficient heuristic,
481 assuming that non-surrogate characters with a code point >= 0x8000 are
482 rare in most input.
483*/
484# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
485#endif
486/* The mask for fast byte-swapping. */
487#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
488/* Swap bytes. */
489#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
490 (((value) & STRIPPED_MASK) << 8))
491
492Py_LOCAL_INLINE(Py_UCS4)
493STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
494 STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
495 int native_ordering)
496{
497 Py_UCS4 ch;
498 const unsigned char *aligned_end =
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200499 (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
Antoine Pitrou63065d72012-05-15 23:48:04 +0200500 const unsigned char *q = *inptr;
501 STRINGLIB_CHAR *p = dest + *outpos;
502 /* Offsets from q for retrieving byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +0200503#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +0200504 int ihi = !!native_ordering, ilo = !native_ordering;
505#else
506 int ihi = !native_ordering, ilo = !!native_ordering;
507#endif
508 --e;
509
510 while (q < e) {
511 Py_UCS4 ch2;
512 /* First check for possible aligned read of a C 'long'. Unaligned
513 reads are more expensive, better to defer to another iteration. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200514 if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
Antoine Pitrou63065d72012-05-15 23:48:04 +0200515 /* Fast path for runs of in-range non-surrogate chars. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200516 const unsigned char *_q = q;
Antoine Pitrou63065d72012-05-15 23:48:04 +0200517 while (_q < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -0600518 unsigned long block = * (const unsigned long *) _q;
Antoine Pitrou63065d72012-05-15 23:48:04 +0200519 if (native_ordering) {
520 /* Can use buffer directly */
521 if (block & FAST_CHAR_MASK)
522 break;
523 }
524 else {
525 /* Need to byte-swap */
526 if (block & SWAB(FAST_CHAR_MASK))
527 break;
528#if STRINGLIB_SIZEOF_CHAR == 1
529 block >>= 8;
530#else
531 block = SWAB(block);
532#endif
533 }
Christian Heimes743e0cd2012-10-17 23:52:17 +0200534#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +0200535# if SIZEOF_LONG == 4
536 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
537 p[1] = (STRINGLIB_CHAR)(block >> 16);
538# elif SIZEOF_LONG == 8
539 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
540 p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
541 p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
542 p[3] = (STRINGLIB_CHAR)(block >> 48);
543# endif
544#else
545# if SIZEOF_LONG == 4
546 p[0] = (STRINGLIB_CHAR)(block >> 16);
547 p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
548# elif SIZEOF_LONG == 8
549 p[0] = (STRINGLIB_CHAR)(block >> 48);
550 p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
551 p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
552 p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
553# endif
554#endif
555 _q += SIZEOF_LONG;
556 p += SIZEOF_LONG / 2;
557 }
558 q = _q;
559 if (q >= e)
560 break;
561 }
562
563 ch = (q[ihi] << 8) | q[ilo];
564 q += 2;
565 if (!Py_UNICODE_IS_SURROGATE(ch)) {
566#if STRINGLIB_SIZEOF_CHAR < 2
567 if (ch > STRINGLIB_MAX_CHAR)
568 /* Out-of-range */
569 goto Return;
570#endif
571 *p++ = (STRINGLIB_CHAR)ch;
572 continue;
573 }
574
575 /* UTF-16 code pair: */
Antoine Pitrou63065d72012-05-15 23:48:04 +0200576 if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
577 goto IllegalEncoding;
Serhiy Storchaka894263b2019-06-25 11:54:18 +0300578 if (q >= e)
579 goto UnexpectedEnd;
Antoine Pitrou63065d72012-05-15 23:48:04 +0200580 ch2 = (q[ihi] << 8) | q[ilo];
581 q += 2;
582 if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
583 goto IllegalSurrogate;
584 ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
585#if STRINGLIB_SIZEOF_CHAR < 4
586 /* Out-of-range */
587 goto Return;
588#else
589 *p++ = (STRINGLIB_CHAR)ch;
590#endif
591 }
592 ch = 0;
593Return:
594 *inptr = q;
595 *outpos = p - dest;
596 return ch;
597UnexpectedEnd:
598 ch = 1;
599 goto Return;
600IllegalEncoding:
601 ch = 2;
602 goto Return;
603IllegalSurrogate:
604 ch = 3;
605 goto Return;
606}
607#undef UCS2_REPEAT_MASK
608#undef FAST_CHAR_MASK
609#undef STRIPPED_MASK
610#undef SWAB
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200611
612
#if STRINGLIB_MAX_CHAR >= 0x80
614Py_LOCAL_INLINE(Py_ssize_t)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200615STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
616 Py_ssize_t len,
617 unsigned short **outptr,
618 int native_ordering)
619{
620 unsigned short *out = *outptr;
621 const STRINGLIB_CHAR *end = in + len;
622#if STRINGLIB_SIZEOF_CHAR == 1
623 if (native_ordering) {
624 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
625 while (in < unrolled_end) {
626 out[0] = in[0];
627 out[1] = in[1];
628 out[2] = in[2];
629 out[3] = in[3];
630 in += 4; out += 4;
631 }
632 while (in < end) {
633 *out++ = *in++;
634 }
635 } else {
636# define SWAB2(CH) ((CH) << 8) /* high byte is zero */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200637 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200638 while (in < unrolled_end) {
639 out[0] = SWAB2(in[0]);
640 out[1] = SWAB2(in[1]);
641 out[2] = SWAB2(in[2]);
642 out[3] = SWAB2(in[3]);
643 in += 4; out += 4;
644 }
645 while (in < end) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200646 Py_UCS4 ch = *in++;
647 *out++ = SWAB2((Py_UCS2)ch);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200648 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200649#undef SWAB2
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200650 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200651 *outptr = out;
652 return len;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200653#else
654 if (native_ordering) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200655#if STRINGLIB_MAX_CHAR < 0x10000
656 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
657 while (in < unrolled_end) {
658 /* check if any character is a surrogate character */
659 if (((in[0] ^ 0xd800) &
660 (in[1] ^ 0xd800) &
661 (in[2] ^ 0xd800) &
662 (in[3] ^ 0xd800) & 0xf800) == 0)
663 break;
664 out[0] = in[0];
665 out[1] = in[1];
666 out[2] = in[2];
667 out[3] = in[3];
668 in += 4; out += 4;
669 }
670#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200671 while (in < end) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200672 Py_UCS4 ch;
673 ch = *in++;
674 if (ch < 0xd800)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200675 *out++ = ch;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200676 else if (ch < 0xe000)
Serhiy Storchaka7e29eea2015-05-18 22:19:42 +0300677 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200678 goto fail;
679#if STRINGLIB_MAX_CHAR >= 0x10000
680 else if (ch >= 0x10000) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200681 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
682 out[1] = Py_UNICODE_LOW_SURROGATE(ch);
683 out += 2;
684 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200685#endif
686 else
687 *out++ = ch;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200688 }
689 } else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200690#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
691#if STRINGLIB_MAX_CHAR < 0x10000
692 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
693 while (in < unrolled_end) {
694 /* check if any character is a surrogate character */
695 if (((in[0] ^ 0xd800) &
696 (in[1] ^ 0xd800) &
697 (in[2] ^ 0xd800) &
698 (in[3] ^ 0xd800) & 0xf800) == 0)
699 break;
700 out[0] = SWAB2(in[0]);
701 out[1] = SWAB2(in[1]);
702 out[2] = SWAB2(in[2]);
703 out[3] = SWAB2(in[3]);
704 in += 4; out += 4;
705 }
706#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200707 while (in < end) {
708 Py_UCS4 ch = *in++;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200709 if (ch < 0xd800)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200710 *out++ = SWAB2((Py_UCS2)ch);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200711 else if (ch < 0xe000)
Serhiy Storchaka7e29eea2015-05-18 22:19:42 +0300712 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200713 goto fail;
714#if STRINGLIB_MAX_CHAR >= 0x10000
715 else if (ch >= 0x10000) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200716 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
717 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
718 out[0] = SWAB2(ch1);
719 out[1] = SWAB2(ch2);
720 out += 2;
721 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200722#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200723 else
724 *out++ = SWAB2((Py_UCS2)ch);
725 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200726#undef SWAB2
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200727 }
728 *outptr = out;
729 return len;
730 fail:
731 *outptr = out;
732 return len - (end - in + 1);
733#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200734}
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300735
736#if STRINGLIB_SIZEOF_CHAR == 1
737# define SWAB4(CH, tmp) ((CH) << 24) /* high bytes are zero */
738#elif STRINGLIB_SIZEOF_CHAR == 2
739# define SWAB4(CH, tmp) (tmp = (CH), \
740 ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
741 /* high bytes are zero */
742#else
743# define SWAB4(CH, tmp) (tmp = (CH), \
744 tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
745 ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
746#endif
747Py_LOCAL_INLINE(Py_ssize_t)
748STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
749 Py_ssize_t len,
750 PY_UINT32_T **outptr,
751 int native_ordering)
752{
753 PY_UINT32_T *out = *outptr;
754 const STRINGLIB_CHAR *end = in + len;
755 if (native_ordering) {
756 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
757 while (in < unrolled_end) {
758#if STRINGLIB_SIZEOF_CHAR > 1
759 /* check if any character is a surrogate character */
760 if (((in[0] ^ 0xd800) &
761 (in[1] ^ 0xd800) &
762 (in[2] ^ 0xd800) &
763 (in[3] ^ 0xd800) & 0xf800) == 0)
764 break;
765#endif
766 out[0] = in[0];
767 out[1] = in[1];
768 out[2] = in[2];
769 out[3] = in[3];
770 in += 4; out += 4;
771 }
772 while (in < end) {
773 Py_UCS4 ch;
774 ch = *in++;
775#if STRINGLIB_SIZEOF_CHAR > 1
776 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchaka9ce71a62015-05-18 22:20:18 +0300777 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300778 goto fail;
779 }
780#endif
781 *out++ = ch;
782 }
783 } else {
784 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
785 while (in < unrolled_end) {
786#if STRINGLIB_SIZEOF_CHAR > 1
787 Py_UCS4 ch1, ch2, ch3, ch4;
788 /* check if any character is a surrogate character */
789 if (((in[0] ^ 0xd800) &
790 (in[1] ^ 0xd800) &
791 (in[2] ^ 0xd800) &
792 (in[3] ^ 0xd800) & 0xf800) == 0)
793 break;
794#endif
795 out[0] = SWAB4(in[0], ch1);
796 out[1] = SWAB4(in[1], ch2);
797 out[2] = SWAB4(in[2], ch3);
798 out[3] = SWAB4(in[3], ch4);
799 in += 4; out += 4;
800 }
801 while (in < end) {
802 Py_UCS4 ch = *in++;
803#if STRINGLIB_SIZEOF_CHAR > 1
804 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchaka9ce71a62015-05-18 22:20:18 +0300805 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300806 goto fail;
807 }
808#endif
809 *out++ = SWAB4(ch, ch);
810 }
811 }
812 *outptr = out;
813 return len;
814#if STRINGLIB_SIZEOF_CHAR > 1
815 fail:
816 *outptr = out;
817 return len - (end - in + 1);
818#endif
819}
820#undef SWAB4
821
#endif /* STRINGLIB_MAX_CHAR >= 0x80 */