/* Source blob: 39c155321e47255f6462d695271c84d35f314108 */
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01001/* stringlib: codec implementations */
2
Serhiy Storchakabcde10a2016-05-16 09:42:29 +03003#if !STRINGLIB_IS_UNICODE
4# error "codecs.h is specific to Unicode"
5#endif
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01006
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01007/* Mask to quickly check whether a C 'long' contains a
8 non-ASCII, UTF8-encoded char. */
9#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +020010# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010011#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +020012# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010013#else
14# error C 'long' size should be either 4 or 8!
15#endif
16
Mark Dickinson106c4142012-06-23 21:45:14 +010017/* 10xxxxxx */
18#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
19
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020020Py_LOCAL_INLINE(Py_UCS4)
21STRINGLIB(utf8_decode)(const char **inptr, const char *end,
22 STRINGLIB_CHAR *dest,
23 Py_ssize_t *outpos)
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010024{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020025 Py_UCS4 ch;
26 const char *s = *inptr;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020027 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020028 STRINGLIB_CHAR *p = dest + *outpos;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010029
30 while (s < end) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020031 ch = (unsigned char)*s;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010032
33 if (ch < 0x80) {
34 /* Fast path for runs of ASCII characters. Given that common UTF-8
35 input will consist of an overwhelming majority of ASCII
36 characters, we try to optimize for this case by checking
37 as many characters as a C 'long' can contain.
38 First, check if we can do an aligned read, as most CPUs have
39 a penalty for unaligned reads.
40 */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +020041 if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010042 /* Help register allocation */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020043 const char *_s = s;
44 STRINGLIB_CHAR *_p = p;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010045 while (_s < aligned_end) {
46 /* Read a whole long at a time (either 4 or 8 bytes),
47 and do a fast unrolled copy if it only contains ASCII
48 characters. */
Andy Lestere6be9b52020-02-11 20:28:35 -060049 unsigned long value = *(const unsigned long *) _s;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010050 if (value & ASCII_CHAR_MASK)
51 break;
Christian Heimes743e0cd2012-10-17 23:52:17 +020052#if PY_LITTLE_ENDIAN
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020053 _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
54 _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
55 _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
56 _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
57# if SIZEOF_LONG == 8
58 _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
59 _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
60 _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
61 _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
62# endif
63#else
64# if SIZEOF_LONG == 8
65 _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
66 _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
67 _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
68 _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
69 _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
70 _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
71 _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
72 _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
73# else
74 _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
75 _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
76 _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
77 _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
78# endif
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010079#endif
80 _s += SIZEOF_LONG;
81 _p += SIZEOF_LONG;
82 }
83 s = _s;
84 p = _p;
85 if (s == end)
86 break;
87 ch = (unsigned char)*s;
88 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020089 if (ch < 0x80) {
90 s++;
91 *p++ = ch;
92 continue;
93 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +010094 }
95
Antoine Pitrouca5f91b2012-05-10 16:36:02 +020096 if (ch < 0xE0) {
97 /* \xC2\x80-\xDF\xBF -- 0080-07FF */
Victor Stinnerab60de42012-11-04 23:59:15 +010098 Py_UCS4 ch2;
Ezio Melottif7ed5d12012-11-04 23:21:38 +020099 if (ch < 0xC2) {
100 /* invalid sequence
101 \x80-\xBF -- continuation byte
102 \xC0-\xC1 -- fake 0000-007F */
103 goto InvalidStart;
104 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200105 if (end - s < 2) {
106 /* unexpected end of data: the caller will decide whether
107 it's an error or not */
108 break;
109 }
110 ch2 = (unsigned char)s[1];
Mark Dickinson106c4142012-06-23 21:45:14 +0100111 if (!IS_CONTINUATION_BYTE(ch2))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200112 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200113 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200114 ch = (ch << 6) + ch2 -
115 ((0xC0 << 6) + 0x80);
116 assert ((ch > 0x007F) && (ch <= 0x07FF));
117 s += 2;
118 if (STRINGLIB_MAX_CHAR <= 0x007F ||
119 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200120 /* Out-of-range */
121 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100122 *p++ = ch;
123 continue;
124 }
125
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200126 if (ch < 0xF0) {
127 /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
128 Py_UCS4 ch2, ch3;
129 if (end - s < 3) {
130 /* unexpected end of data: the caller will decide whether
131 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200132 if (end - s < 2)
133 break;
134 ch2 = (unsigned char)s[1];
135 if (!IS_CONTINUATION_BYTE(ch2) ||
136 (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
137 /* for clarification see comments below */
138 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200139 break;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100140 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200141 ch2 = (unsigned char)s[1];
142 ch3 = (unsigned char)s[2];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200143 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200144 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200145 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200146 }
147 if (ch == 0xE0) {
148 if (ch2 < 0xA0)
149 /* invalid sequence
150 \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200151 goto InvalidContinuation1;
152 } else if (ch == 0xED && ch2 >= 0xA0) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200153 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
154 will result in surrogates in range D800-DFFF. Surrogates are
155 not valid UTF-8 so they are rejected.
Benjamin Peterson51796e52020-03-10 21:10:59 -0700156 See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200157 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200158 goto InvalidContinuation1;
159 }
160 if (!IS_CONTINUATION_BYTE(ch3)) {
161 /* invalid continuation byte */
162 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200163 }
164 ch = (ch << 12) + (ch2 << 6) + ch3 -
165 ((0xE0 << 12) + (0x80 << 6) + 0x80);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100166 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
167 s += 3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200168 if (STRINGLIB_MAX_CHAR <= 0x07FF ||
169 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200170 /* Out-of-range */
171 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100172 *p++ = ch;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200173 continue;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100174 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200175
176 if (ch < 0xF5) {
177 /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
178 Py_UCS4 ch2, ch3, ch4;
179 if (end - s < 4) {
180 /* unexpected end of data: the caller will decide whether
181 it's an error or not */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200182 if (end - s < 2)
183 break;
184 ch2 = (unsigned char)s[1];
185 if (!IS_CONTINUATION_BYTE(ch2) ||
186 (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
187 /* for clarification see comments below */
188 goto InvalidContinuation1;
189 if (end - s < 3)
190 break;
191 ch3 = (unsigned char)s[2];
192 if (!IS_CONTINUATION_BYTE(ch3))
193 goto InvalidContinuation2;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200194 break;
195 }
196 ch2 = (unsigned char)s[1];
197 ch3 = (unsigned char)s[2];
198 ch4 = (unsigned char)s[3];
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200199 if (!IS_CONTINUATION_BYTE(ch2)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200200 /* invalid continuation byte */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200201 goto InvalidContinuation1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200202 }
203 if (ch == 0xF0) {
204 if (ch2 < 0x90)
205 /* invalid sequence
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200206 \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
207 goto InvalidContinuation1;
208 } else if (ch == 0xF4 && ch2 >= 0x90) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200209 /* invalid sequence
Serhiy Storchaka894263b2019-06-25 11:54:18 +0300210 \xF4\x90\x80\x80- -- 110000- overflow */
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200211 goto InvalidContinuation1;
212 }
213 if (!IS_CONTINUATION_BYTE(ch3)) {
214 /* invalid continuation byte */
215 goto InvalidContinuation2;
216 }
217 if (!IS_CONTINUATION_BYTE(ch4)) {
218 /* invalid continuation byte */
219 goto InvalidContinuation3;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200220 }
221 ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
222 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
223 assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
224 s += 4;
225 if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
226 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200227 /* Out-of-range */
228 goto Return;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200229 *p++ = ch;
230 continue;
231 }
232 goto InvalidStart;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100233 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200234 ch = 0;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200235Return:
236 *inptr = s;
237 *outpos = p - dest;
238 return ch;
239InvalidStart:
240 ch = 1;
241 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200242InvalidContinuation1:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +0200243 ch = 2;
244 goto Return;
Ezio Melottif7ed5d12012-11-04 23:21:38 +0200245InvalidContinuation2:
246 ch = 3;
247 goto Return;
248InvalidContinuation3:
249 ch = 4;
250 goto Return;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100251}
252
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100253#undef ASCII_CHAR_MASK
254
Victor Stinner6099a032011-12-18 14:22:26 +0100255
256/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
257 PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
258 UCS-1 strings don't need to handle surrogates for example. */
Inada Naoki02a4d572020-02-27 13:48:59 +0900259Py_LOCAL_INLINE(char *)
260STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
261 PyObject *unicode,
Victor Stinner6099a032011-12-18 14:22:26 +0100262 STRINGLIB_CHAR *data,
263 Py_ssize_t size,
Victor Stinner709d23d2019-05-02 14:56:30 -0400264 _Py_error_handler error_handler,
Victor Stinner6099a032011-12-18 14:22:26 +0100265 const char *errors)
266{
Serhiy Storchaka998c9cd2016-10-30 18:25:27 +0200267 Py_ssize_t i; /* index into data of next input character */
Victor Stinner6099a032011-12-18 14:22:26 +0100268 char *p; /* next free byte in output buffer */
Victor Stinner6099a032011-12-18 14:22:26 +0100269#if STRINGLIB_SIZEOF_CHAR > 1
Victor Stinner01ada392015-10-01 21:54:51 +0200270 PyObject *error_handler_obj = NULL;
Victor Stinner6099a032011-12-18 14:22:26 +0100271 PyObject *exc = NULL;
272 PyObject *rep = NULL;
273#endif
274#if STRINGLIB_SIZEOF_CHAR == 1
275 const Py_ssize_t max_char_size = 2;
Victor Stinner6099a032011-12-18 14:22:26 +0100276#elif STRINGLIB_SIZEOF_CHAR == 2
277 const Py_ssize_t max_char_size = 3;
Victor Stinner6099a032011-12-18 14:22:26 +0100278#else /* STRINGLIB_SIZEOF_CHAR == 4 */
279 const Py_ssize_t max_char_size = 4;
Victor Stinner6099a032011-12-18 14:22:26 +0100280#endif
281
282 assert(size >= 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200283 if (size > PY_SSIZE_T_MAX / max_char_size) {
284 /* integer overflow */
Inada Naoki02a4d572020-02-27 13:48:59 +0900285 PyErr_NoMemory();
286 return NULL;
Victor Stinner6099a032011-12-18 14:22:26 +0100287 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200288
Inada Naoki02a4d572020-02-27 13:48:59 +0900289 _PyBytesWriter_Init(writer);
290 p = _PyBytesWriter_Alloc(writer, size * max_char_size);
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200291 if (p == NULL)
292 return NULL;
Victor Stinner6099a032011-12-18 14:22:26 +0100293
294 for (i = 0; i < size;) {
295 Py_UCS4 ch = data[i++];
296
297 if (ch < 0x80) {
298 /* Encode ASCII */
299 *p++ = (char) ch;
300
301 }
302 else
303#if STRINGLIB_SIZEOF_CHAR > 1
304 if (ch < 0x0800)
305#endif
306 {
307 /* Encode Latin-1 */
308 *p++ = (char)(0xc0 | (ch >> 6));
309 *p++ = (char)(0x80 | (ch & 0x3f));
310 }
311#if STRINGLIB_SIZEOF_CHAR > 1
312 else if (Py_UNICODE_IS_SURROGATE(ch)) {
Victor Stinner01ada392015-10-01 21:54:51 +0200313 Py_ssize_t startpos, endpos, newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +0200314 Py_ssize_t k;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200315 if (error_handler == _Py_ERROR_UNKNOWN) {
Victor Stinner3d4226a2018-08-29 22:21:32 +0200316 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200317 }
Victor Stinner01ada392015-10-01 21:54:51 +0200318
Victor Stinner6099a032011-12-18 14:22:26 +0100319 startpos = i-1;
Victor Stinner01ada392015-10-01 21:54:51 +0200320 endpos = startpos+1;
Victor Stinner6099a032011-12-18 14:22:26 +0100321
Victor Stinner01ada392015-10-01 21:54:51 +0200322 while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
323 endpos++;
Victor Stinner6099a032011-12-18 14:22:26 +0100324
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200325 /* Only overallocate the buffer if it's not the last write */
Inada Naoki02a4d572020-02-27 13:48:59 +0900326 writer->overallocate = (endpos < size);
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200327
Victor Stinner01ada392015-10-01 21:54:51 +0200328 switch (error_handler)
329 {
330 case _Py_ERROR_REPLACE:
331 memset(p, '?', endpos - startpos);
332 p += (endpos - startpos);
Stefan Krahf432a322017-08-21 13:09:59 +0200333 /* fall through */
Victor Stinner01ada392015-10-01 21:54:51 +0200334 case _Py_ERROR_IGNORE:
335 i += (endpos - startpos - 1);
336 break;
Victor Stinner6099a032011-12-18 14:22:26 +0100337
Victor Stinner01ada392015-10-01 21:54:51 +0200338 case _Py_ERROR_SURROGATEPASS:
339 for (k=startpos; k<endpos; k++) {
340 ch = data[k];
341 *p++ = (char)(0xe0 | (ch >> 12));
342 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
343 *p++ = (char)(0x80 | (ch & 0x3f));
344 }
345 i += (endpos - startpos - 1);
346 break;
347
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200348 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -0700349 /* subtract preallocated bytes */
Inada Naoki02a4d572020-02-27 13:48:59 +0900350 writer->min_size -= max_char_size * (endpos - startpos);
351 p = backslashreplace(writer, p,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200352 unicode, startpos, endpos);
353 if (p == NULL)
354 goto error;
355 i += (endpos - startpos - 1);
356 break;
357
358 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -0700359 /* subtract preallocated bytes */
Inada Naoki02a4d572020-02-27 13:48:59 +0900360 writer->min_size -= max_char_size * (endpos - startpos);
361 p = xmlcharrefreplace(writer, p,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200362 unicode, startpos, endpos);
363 if (p == NULL)
364 goto error;
365 i += (endpos - startpos - 1);
366 break;
367
Victor Stinner01ada392015-10-01 21:54:51 +0200368 case _Py_ERROR_SURROGATEESCAPE:
369 for (k=startpos; k<endpos; k++) {
370 ch = data[k];
371 if (!(0xDC80 <= ch && ch <= 0xDCFF))
372 break;
373 *p++ = (char)(ch & 0xff);
374 }
375 if (k >= endpos) {
376 i += (endpos - startpos - 1);
377 break;
378 }
379 startpos = k;
380 assert(startpos < endpos);
Stefan Krahf432a322017-08-21 13:09:59 +0200381 /* fall through */
Victor Stinner01ada392015-10-01 21:54:51 +0200382 default:
383 rep = unicode_encode_call_errorhandler(
384 errors, &error_handler_obj, "utf-8", "surrogates not allowed",
385 unicode, &exc, startpos, endpos, &newpos);
386 if (!rep)
387 goto error;
388
Raymond Hettinger15f44ab2016-08-30 10:47:49 -0700389 /* subtract preallocated bytes */
Inada Naoki02a4d572020-02-27 13:48:59 +0900390 writer->min_size -= max_char_size * (newpos - startpos);
Victor Stinnerad771582015-10-09 12:38:53 +0200391
Victor Stinner01ada392015-10-01 21:54:51 +0200392 if (PyBytes_Check(rep)) {
Inada Naoki02a4d572020-02-27 13:48:59 +0900393 p = _PyBytesWriter_WriteBytes(writer, p,
Victor Stinnerce179bf2015-10-09 12:57:22 +0200394 PyBytes_AS_STRING(rep),
395 PyBytes_GET_SIZE(rep));
Victor Stinner01ada392015-10-01 21:54:51 +0200396 }
397 else {
398 /* rep is unicode */
399 if (PyUnicode_READY(rep) < 0)
400 goto error;
Victor Stinner6099a032011-12-18 14:22:26 +0100401
Victor Stinner01ada392015-10-01 21:54:51 +0200402 if (!PyUnicode_IS_ASCII(rep)) {
Serhiy Storchaka998c9cd2016-10-30 18:25:27 +0200403 raise_encode_exception(&exc, "utf-8", unicode,
404 startpos, endpos,
Victor Stinner6099a032011-12-18 14:22:26 +0100405 "surrogates not allowed");
406 goto error;
407 }
Victor Stinner01ada392015-10-01 21:54:51 +0200408
Inada Naoki02a4d572020-02-27 13:48:59 +0900409 p = _PyBytesWriter_WriteBytes(writer, p,
Victor Stinner6bd525b2015-10-09 13:10:05 +0200410 PyUnicode_DATA(rep),
411 PyUnicode_GET_LENGTH(rep));
Victor Stinner6099a032011-12-18 14:22:26 +0100412 }
Victor Stinner6bd525b2015-10-09 13:10:05 +0200413
414 if (p == NULL)
415 goto error;
Victor Stinner01ada392015-10-01 21:54:51 +0200416 Py_CLEAR(rep);
417
418 i = newpos;
Victor Stinner6099a032011-12-18 14:22:26 +0100419 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200420
421 /* If overallocation was disabled, ensure that it was the last
422 write. Otherwise, we missed an optimization */
Inada Naoki02a4d572020-02-27 13:48:59 +0900423 assert(writer->overallocate || i == size);
Victor Stinner6099a032011-12-18 14:22:26 +0100424 }
425 else
426#if STRINGLIB_SIZEOF_CHAR > 2
427 if (ch < 0x10000)
428#endif
429 {
430 *p++ = (char)(0xe0 | (ch >> 12));
431 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
432 *p++ = (char)(0x80 | (ch & 0x3f));
433 }
434#if STRINGLIB_SIZEOF_CHAR > 2
435 else /* ch >= 0x10000 */
436 {
437 assert(ch <= MAX_UNICODE);
438 /* Encode UCS4 Unicode ordinals */
439 *p++ = (char)(0xf0 | (ch >> 18));
440 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
441 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
442 *p++ = (char)(0x80 | (ch & 0x3f));
443 }
444#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
445#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
446 }
447
Victor Stinner6099a032011-12-18 14:22:26 +0100448#if STRINGLIB_SIZEOF_CHAR > 1
Victor Stinner01ada392015-10-01 21:54:51 +0200449 Py_XDECREF(error_handler_obj);
Victor Stinner6099a032011-12-18 14:22:26 +0100450 Py_XDECREF(exc);
451#endif
Inada Naoki02a4d572020-02-27 13:48:59 +0900452 return p;
Victor Stinner6099a032011-12-18 14:22:26 +0100453
454#if STRINGLIB_SIZEOF_CHAR > 1
455 error:
456 Py_XDECREF(rep);
Victor Stinner01ada392015-10-01 21:54:51 +0200457 Py_XDECREF(error_handler_obj);
Victor Stinner6099a032011-12-18 14:22:26 +0100458 Py_XDECREF(exc);
Victor Stinner6099a032011-12-18 14:22:26 +0100459 return NULL;
460#endif
Victor Stinner6099a032011-12-18 14:22:26 +0100461}
462
Antoine Pitrou63065d72012-05-15 23:48:04 +0200463/* The pattern for constructing UCS2-repeated masks. */
464#if SIZEOF_LONG == 8
465# define UCS2_REPEAT_MASK 0x0001000100010001ul
466#elif SIZEOF_LONG == 4
467# define UCS2_REPEAT_MASK 0x00010001ul
468#else
469# error C 'long' size should be either 4 or 8!
470#endif
471
472/* The mask for fast checking. */
473#if STRINGLIB_SIZEOF_CHAR == 1
474/* The mask for fast checking of whether a C 'long' contains a
475 non-ASCII or non-Latin1 UTF16-encoded characters. */
476# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
477#else
478/* The mask for fast checking of whether a C 'long' may contain
479 UTF16-encoded surrogate characters. This is an efficient heuristic,
480 assuming that non-surrogate characters with a code point >= 0x8000 are
481 rare in most input.
482*/
483# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
484#endif
485/* The mask for fast byte-swapping. */
486#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
487/* Swap bytes. */
488#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
489 (((value) & STRIPPED_MASK) << 8))
490
491Py_LOCAL_INLINE(Py_UCS4)
492STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
493 STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
494 int native_ordering)
495{
496 Py_UCS4 ch;
497 const unsigned char *aligned_end =
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200498 (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
Antoine Pitrou63065d72012-05-15 23:48:04 +0200499 const unsigned char *q = *inptr;
500 STRINGLIB_CHAR *p = dest + *outpos;
501 /* Offsets from q for retrieving byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +0200502#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +0200503 int ihi = !!native_ordering, ilo = !native_ordering;
504#else
505 int ihi = !native_ordering, ilo = !!native_ordering;
506#endif
507 --e;
508
509 while (q < e) {
510 Py_UCS4 ch2;
511 /* First check for possible aligned read of a C 'long'. Unaligned
512 reads are more expensive, better to defer to another iteration. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200513 if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
Antoine Pitrou63065d72012-05-15 23:48:04 +0200514 /* Fast path for runs of in-range non-surrogate chars. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200515 const unsigned char *_q = q;
Antoine Pitrou63065d72012-05-15 23:48:04 +0200516 while (_q < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -0600517 unsigned long block = * (const unsigned long *) _q;
Antoine Pitrou63065d72012-05-15 23:48:04 +0200518 if (native_ordering) {
519 /* Can use buffer directly */
520 if (block & FAST_CHAR_MASK)
521 break;
522 }
523 else {
524 /* Need to byte-swap */
525 if (block & SWAB(FAST_CHAR_MASK))
526 break;
527#if STRINGLIB_SIZEOF_CHAR == 1
528 block >>= 8;
529#else
530 block = SWAB(block);
531#endif
532 }
Christian Heimes743e0cd2012-10-17 23:52:17 +0200533#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +0200534# if SIZEOF_LONG == 4
535 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
536 p[1] = (STRINGLIB_CHAR)(block >> 16);
537# elif SIZEOF_LONG == 8
538 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
539 p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
540 p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
541 p[3] = (STRINGLIB_CHAR)(block >> 48);
542# endif
543#else
544# if SIZEOF_LONG == 4
545 p[0] = (STRINGLIB_CHAR)(block >> 16);
546 p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
547# elif SIZEOF_LONG == 8
548 p[0] = (STRINGLIB_CHAR)(block >> 48);
549 p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
550 p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
551 p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
552# endif
553#endif
554 _q += SIZEOF_LONG;
555 p += SIZEOF_LONG / 2;
556 }
557 q = _q;
558 if (q >= e)
559 break;
560 }
561
562 ch = (q[ihi] << 8) | q[ilo];
563 q += 2;
564 if (!Py_UNICODE_IS_SURROGATE(ch)) {
565#if STRINGLIB_SIZEOF_CHAR < 2
566 if (ch > STRINGLIB_MAX_CHAR)
567 /* Out-of-range */
568 goto Return;
569#endif
570 *p++ = (STRINGLIB_CHAR)ch;
571 continue;
572 }
573
574 /* UTF-16 code pair: */
Antoine Pitrou63065d72012-05-15 23:48:04 +0200575 if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
576 goto IllegalEncoding;
Serhiy Storchaka894263b2019-06-25 11:54:18 +0300577 if (q >= e)
578 goto UnexpectedEnd;
Antoine Pitrou63065d72012-05-15 23:48:04 +0200579 ch2 = (q[ihi] << 8) | q[ilo];
580 q += 2;
581 if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
582 goto IllegalSurrogate;
583 ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
584#if STRINGLIB_SIZEOF_CHAR < 4
585 /* Out-of-range */
586 goto Return;
587#else
588 *p++ = (STRINGLIB_CHAR)ch;
589#endif
590 }
591 ch = 0;
592Return:
593 *inptr = q;
594 *outpos = p - dest;
595 return ch;
596UnexpectedEnd:
597 ch = 1;
598 goto Return;
599IllegalEncoding:
600 ch = 2;
601 goto Return;
602IllegalSurrogate:
603 ch = 3;
604 goto Return;
605}
606#undef UCS2_REPEAT_MASK
607#undef FAST_CHAR_MASK
608#undef STRIPPED_MASK
609#undef SWAB
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200610
611
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200612#if STRINGLIB_MAX_CHAR >= 0x80
613Py_LOCAL_INLINE(Py_ssize_t)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200614STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
615 Py_ssize_t len,
616 unsigned short **outptr,
617 int native_ordering)
618{
619 unsigned short *out = *outptr;
620 const STRINGLIB_CHAR *end = in + len;
621#if STRINGLIB_SIZEOF_CHAR == 1
622 if (native_ordering) {
623 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
624 while (in < unrolled_end) {
625 out[0] = in[0];
626 out[1] = in[1];
627 out[2] = in[2];
628 out[3] = in[3];
629 in += 4; out += 4;
630 }
631 while (in < end) {
632 *out++ = *in++;
633 }
634 } else {
635# define SWAB2(CH) ((CH) << 8) /* high byte is zero */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +0200636 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200637 while (in < unrolled_end) {
638 out[0] = SWAB2(in[0]);
639 out[1] = SWAB2(in[1]);
640 out[2] = SWAB2(in[2]);
641 out[3] = SWAB2(in[3]);
642 in += 4; out += 4;
643 }
644 while (in < end) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200645 Py_UCS4 ch = *in++;
646 *out++ = SWAB2((Py_UCS2)ch);
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200647 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200648#undef SWAB2
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200649 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200650 *outptr = out;
651 return len;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200652#else
653 if (native_ordering) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200654#if STRINGLIB_MAX_CHAR < 0x10000
655 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
656 while (in < unrolled_end) {
657 /* check if any character is a surrogate character */
658 if (((in[0] ^ 0xd800) &
659 (in[1] ^ 0xd800) &
660 (in[2] ^ 0xd800) &
661 (in[3] ^ 0xd800) & 0xf800) == 0)
662 break;
663 out[0] = in[0];
664 out[1] = in[1];
665 out[2] = in[2];
666 out[3] = in[3];
667 in += 4; out += 4;
668 }
669#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200670 while (in < end) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200671 Py_UCS4 ch;
672 ch = *in++;
673 if (ch < 0xd800)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200674 *out++ = ch;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200675 else if (ch < 0xe000)
Serhiy Storchaka7e29eea2015-05-18 22:19:42 +0300676 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200677 goto fail;
678#if STRINGLIB_MAX_CHAR >= 0x10000
679 else if (ch >= 0x10000) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200680 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
681 out[1] = Py_UNICODE_LOW_SURROGATE(ch);
682 out += 2;
683 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200684#endif
685 else
686 *out++ = ch;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200687 }
688 } else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200689#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
690#if STRINGLIB_MAX_CHAR < 0x10000
691 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
692 while (in < unrolled_end) {
693 /* check if any character is a surrogate character */
694 if (((in[0] ^ 0xd800) &
695 (in[1] ^ 0xd800) &
696 (in[2] ^ 0xd800) &
697 (in[3] ^ 0xd800) & 0xf800) == 0)
698 break;
699 out[0] = SWAB2(in[0]);
700 out[1] = SWAB2(in[1]);
701 out[2] = SWAB2(in[2]);
702 out[3] = SWAB2(in[3]);
703 in += 4; out += 4;
704 }
705#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200706 while (in < end) {
707 Py_UCS4 ch = *in++;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200708 if (ch < 0xd800)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200709 *out++ = SWAB2((Py_UCS2)ch);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200710 else if (ch < 0xe000)
Serhiy Storchaka7e29eea2015-05-18 22:19:42 +0300711 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200712 goto fail;
713#if STRINGLIB_MAX_CHAR >= 0x10000
714 else if (ch >= 0x10000) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200715 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
716 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
717 out[0] = SWAB2(ch1);
718 out[1] = SWAB2(ch2);
719 out += 2;
720 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200721#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200722 else
723 *out++ = SWAB2((Py_UCS2)ch);
724 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200725#undef SWAB2
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200726 }
727 *outptr = out;
728 return len;
729 fail:
730 *outptr = out;
731 return len - (end - in + 1);
732#endif
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +0200733}
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300734
735#if STRINGLIB_SIZEOF_CHAR == 1
736# define SWAB4(CH, tmp) ((CH) << 24) /* high bytes are zero */
737#elif STRINGLIB_SIZEOF_CHAR == 2
738# define SWAB4(CH, tmp) (tmp = (CH), \
739 ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
740 /* high bytes are zero */
741#else
742# define SWAB4(CH, tmp) (tmp = (CH), \
743 tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
744 ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
745#endif
746Py_LOCAL_INLINE(Py_ssize_t)
747STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
748 Py_ssize_t len,
749 PY_UINT32_T **outptr,
750 int native_ordering)
751{
752 PY_UINT32_T *out = *outptr;
753 const STRINGLIB_CHAR *end = in + len;
754 if (native_ordering) {
755 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
756 while (in < unrolled_end) {
757#if STRINGLIB_SIZEOF_CHAR > 1
758 /* check if any character is a surrogate character */
759 if (((in[0] ^ 0xd800) &
760 (in[1] ^ 0xd800) &
761 (in[2] ^ 0xd800) &
762 (in[3] ^ 0xd800) & 0xf800) == 0)
763 break;
764#endif
765 out[0] = in[0];
766 out[1] = in[1];
767 out[2] = in[2];
768 out[3] = in[3];
769 in += 4; out += 4;
770 }
771 while (in < end) {
772 Py_UCS4 ch;
773 ch = *in++;
774#if STRINGLIB_SIZEOF_CHAR > 1
775 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchaka9ce71a62015-05-18 22:20:18 +0300776 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300777 goto fail;
778 }
779#endif
780 *out++ = ch;
781 }
782 } else {
783 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
784 while (in < unrolled_end) {
785#if STRINGLIB_SIZEOF_CHAR > 1
786 Py_UCS4 ch1, ch2, ch3, ch4;
787 /* check if any character is a surrogate character */
788 if (((in[0] ^ 0xd800) &
789 (in[1] ^ 0xd800) &
790 (in[2] ^ 0xd800) &
791 (in[3] ^ 0xd800) & 0xf800) == 0)
792 break;
793#endif
794 out[0] = SWAB4(in[0], ch1);
795 out[1] = SWAB4(in[1], ch2);
796 out[2] = SWAB4(in[2], ch3);
797 out[3] = SWAB4(in[3], ch4);
798 in += 4; out += 4;
799 }
800 while (in < end) {
801 Py_UCS4 ch = *in++;
802#if STRINGLIB_SIZEOF_CHAR > 1
803 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchaka9ce71a62015-05-18 22:20:18 +0300804 /* reject surrogate characters (U+D800-U+DFFF) */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +0300805 goto fail;
806 }
807#endif
808 *out++ = SWAB4(ch, ch);
809 }
810 }
811 *outptr = out;
812 return len;
813#if STRINGLIB_SIZEOF_CHAR > 1
814 fail:
815 *outptr = out;
816 return len - (end - in + 1);
817#endif
818}
819#undef SWAB4
820
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200821#endif