/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010043#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010044#include "pycore_object.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010045#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050047#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070048#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/

/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080072
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000080
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000081
82#ifdef __cplusplus
83extern "C" {
84#endif
85
/* Maximum Unicode code point: 0x10ffff (1,114,111). */
#define MAX_UNICODE 0x10ffff
88
/* _PyUnicode_CHECK(op): in debug builds run the structural consistency
   check (check_content=0, so the characters are not scanned); in release
   builds only check that op is a Unicode object. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020094
/* Field accessors for the Unicode object layouts.  The macros whose names
   start with an underscore and carry no assert() read the field directly;
   the others first assert that op passes _PyUnicode_CHECK(). */

/* Raw utf8 field of the compact layout (no checks). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 pointer of a ready string.  For a compact ASCII string this is the
   memory immediately following the PyASCIIObject header; otherwise it is
   the stored utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw utf8_length field of the compact layout (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string.  For a compact ASCII string this is the
   string length itself; otherwise it is the stored utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw wstr pointer and wstr_length fields (no checks). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw length, state and hash fields (no checks). */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind and length, with a _PyUnicode_CHECK() assertion. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw data.any pointer of the non-compact layout (no checks). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200129
/* File-local override of PyUnicode_READY(): asserts _PyUnicode_CHECK(op),
   then evaluates to 0 if op is already ready, otherwise to the result of
   _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200136
/* True if the utf8 pointer of a string that is not compact ASCII is the
   same address as its character data (i.e. no separate UTF-8 buffer). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer is the same address as the character data.
   Uses the macro argument `op` throughout: the expansion must not depend
   on the caller having a variable that happens to be named `unicode`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
144
/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): it is not compact ASCII, its utf8 pointer
   is non-NULL, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op)                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is non-NULL and either the string is
   not ready yet or wstr differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) &&                             \
     (!PyUnicode_IS_READY(op) ||                        \
      _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
158
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The copy runs four elements per iteration over the largest multiple of
   four that fits, then finishes the remaining 0-3 elements one by one.
   All locals are underscore-prefixed so they do not collide with names
   used in the caller's argument expressions. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200182
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
190
Walter Dörwald16807132007-05-25 13:52:07 +0000191/* This dictionary holds all interned unicode strings. Note that references
192 to strings in this dictionary are *not* counted in the string's ob_refcnt.
193 When the interned string reaches a refcnt of 0 the string deallocation
194 function will delete the reference from this dictionary.
195
196 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000197 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000198*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200199static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200203
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If that creation fails,
   unicode_empty stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return the shared empty string from the enclosing function (the returned
   value is NULL if the empty string could not be created). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000222
Victor Stinner59423e32018-11-26 13:40:01 +0100223static inline void
224unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
225 Py_ssize_t start, Py_ssize_t length)
226{
227 assert(0 <= start);
228 assert(kind != PyUnicode_WCHAR_KIND);
229 switch (kind) {
230 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100231 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100232 Py_UCS1 ch = (unsigned char)value;
233 Py_UCS1 *to = (Py_UCS1 *)data + start;
234 memset(to, ch, length);
235 break;
236 }
237 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100238 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100239 Py_UCS2 ch = (Py_UCS2)value;
240 Py_UCS2 *to = (Py_UCS2 *)data + start;
241 const Py_UCS2 *end = to + length;
242 for (; to < end; ++to) *to = ch;
243 break;
244 }
245 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100246 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100247 Py_UCS4 ch = value;
248 Py_UCS4 * to = (Py_UCS4 *)data + start;
249 const Py_UCS4 *end = to + length;
250 for (; to < end; ++to) *to = ch;
251 break;
252 }
253 default: Py_UNREACHABLE();
254 }
255}
256
257
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200258/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700259static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200260_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
261
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200262/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200263static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200264
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265/* Single character Unicode strings in the Latin-1 range are being
266 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200267static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000268
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; an entry is 1 for whitespace, else 0. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,     /* 0x18 - 0x1F */
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
299
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200300/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200301static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200302static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100303static int unicode_modifiable(PyObject *unicode);
304
Victor Stinnerfe226c02011-10-03 03:52:20 +0200305
Alexander Belopolsky40018472011-02-26 01:02:56 +0000306static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100307_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200308static PyObject *
309_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
310static PyObject *
311_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
312
313static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000314unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000315 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100316 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000317 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
318
Alexander Belopolsky40018472011-02-26 01:02:56 +0000319static void
320raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300321 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100322 PyObject *unicode,
323 Py_ssize_t startpos, Py_ssize_t endpos,
324 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000325
/* Same for linebreaks: indexed by code point 0..127; an entry is 1 for a
   line-break character, else 0. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,     /* 0x18 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
353
INADA Naoki3ae20562017-01-16 20:41:20 +0900354static int convert_uc(PyObject *obj, void *addr);
355
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300356#include "clinic/unicodeobject.c.h"
357
Victor Stinner3d4226a2018-08-29 22:21:32 +0200358_Py_error_handler
359_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200360{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200361 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200362 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200363 }
364 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200365 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200366 }
367 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200368 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200369 }
370 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200371 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200372 }
373 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200374 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200375 }
376 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200377 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200378 }
379 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200380 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200381 }
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_OTHER;
383}
384
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300385/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
386 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000387Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000388PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000389{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000390#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000391 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000392#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000393 /* This is actually an illegal character, so it should
394 not be passed to unichr. */
395 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000396#endif
397}
398
#ifdef Py_DEBUG
/* Debug-build check of the internal invariants of a Unicode object.

   op            - object to check; must be a Unicode object.
   check_content - if non-zero (and the string is not of WCHAR kind), also
                   scan the characters to verify that the narrowest
                   possible kind is used and that the buffer is
                   NUL-terminated.

   Every condition is checked through _PyObject_ASSERT(op, ...).  The
   function itself always returns 1 so that it can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
#define ASSERT(expr) _PyObject_ASSERT(op, (expr))

    PyASCIIObject *ascii;
    unsigned int kind;

    ASSERT(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: always 1-byte kind and ready */
        ASSERT(kind == PyUnicode_1BYTE_KIND);
        ASSERT(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            ASSERT(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            ASSERT(ascii->state.ascii == 0);
            ASSERT(ascii->state.ready == 1);
            ASSERT(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer exists */
                ASSERT(ascii->length == 0);
                ASSERT(ascii->hash == -1);
                ASSERT(ascii->state.compact == 0);
                ASSERT(ascii->state.ascii == 0);
                ASSERT(ascii->state.ready == 0);
                ASSERT(ascii->state.interned == SSTATE_NOT_INTERNED);
                ASSERT(ascii->wstr != NULL);
                ASSERT(data == NULL);
                ASSERT(compact->utf8 == NULL);
            }
            else {
                /* non-compact, ready */
                ASSERT(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                ASSERT(ascii->state.compact == 0);
                ASSERT(ascii->state.ready == 1);
                ASSERT(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    ASSERT(compact->utf8 == data);
                    ASSERT(compact->utf8_length == ascii->length);
                }
                else {
                    ASSERT(compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the same
               width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                ASSERT(ascii->wstr == data);
                ASSERT(compact->wstr_length == ascii->length);
            }
            else {
                ASSERT(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            ASSERT(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            ASSERT(compact->wstr_length == 0);
        }
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar) {
                maxchar = ch;
            }
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                ASSERT(maxchar >= 128);
                ASSERT(maxchar <= 255);
            }
            else {
                ASSERT(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            ASSERT(maxchar >= 0x100);
            ASSERT(maxchar <= 0xFFFF);
        }
        else {
            ASSERT(maxchar >= 0x10000);
            ASSERT(maxchar <= MAX_UNICODE);
        }
        /* the character one past the end must be the NUL terminator */
        ASSERT(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef ASSERT
}
#endif
518
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100519static PyObject*
520unicode_result_wchar(PyObject *unicode)
521{
522#ifndef Py_DEBUG
523 Py_ssize_t len;
524
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100525 len = _PyUnicode_WSTR_LENGTH(unicode);
526 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100527 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200528 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100529 }
530
531 if (len == 1) {
532 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100533 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100534 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
535 Py_DECREF(unicode);
536 return latin1_char;
537 }
538 }
539
540 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200541 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100542 return NULL;
543 }
544#else
Victor Stinneraa771272012-10-04 02:32:58 +0200545 assert(Py_REFCNT(unicode) == 1);
546
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100547 /* don't make the result ready in debug mode to ensure that the caller
548 makes the string ready before using it */
549 assert(_PyUnicode_CheckConsistency(unicode, 1));
550#endif
551 return unicode;
552}
553
554static PyObject*
555unicode_result_ready(PyObject *unicode)
556{
557 Py_ssize_t length;
558
559 length = PyUnicode_GET_LENGTH(unicode);
560 if (length == 0) {
561 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100562 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200563 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100564 }
565 return unicode_empty;
566 }
567
568 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200569 void *data = PyUnicode_DATA(unicode);
570 int kind = PyUnicode_KIND(unicode);
571 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100572 if (ch < 256) {
573 PyObject *latin1_char = unicode_latin1[ch];
574 if (latin1_char != NULL) {
575 if (unicode != latin1_char) {
576 Py_INCREF(latin1_char);
577 Py_DECREF(unicode);
578 }
579 return latin1_char;
580 }
581 else {
582 assert(_PyUnicode_CheckConsistency(unicode, 1));
583 Py_INCREF(unicode);
584 unicode_latin1[ch] = unicode;
585 return unicode;
586 }
587 }
588 }
589
590 assert(_PyUnicode_CheckConsistency(unicode, 1));
591 return unicode;
592}
593
594static PyObject*
595unicode_result(PyObject *unicode)
596{
597 assert(_PyUnicode_CHECK(unicode));
598 if (PyUnicode_IS_READY(unicode))
599 return unicode_result_ready(unicode);
600 else
601 return unicode_result_wchar(unicode);
602}
603
Victor Stinnerc4b49542011-12-11 22:44:26 +0100604static PyObject*
605unicode_result_unchanged(PyObject *unicode)
606{
607 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500608 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100609 return NULL;
610 Py_INCREF(unicode);
611 return unicode;
612 }
613 else
614 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100615 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100616}
617
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200618/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
619 ASCII, Latin1, UTF-8, etc. */
620static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200621backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200622 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
623{
Victor Stinnerad771582015-10-09 12:38:53 +0200624 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 Py_UCS4 ch;
626 enum PyUnicode_Kind kind;
627 void *data;
628
629 assert(PyUnicode_IS_READY(unicode));
630 kind = PyUnicode_KIND(unicode);
631 data = PyUnicode_DATA(unicode);
632
633 size = 0;
634 /* determine replacement size */
635 for (i = collstart; i < collend; ++i) {
636 Py_ssize_t incr;
637
638 ch = PyUnicode_READ(kind, data, i);
639 if (ch < 0x100)
640 incr = 2+2;
641 else if (ch < 0x10000)
642 incr = 2+4;
643 else {
644 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200645 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 }
647 if (size > PY_SSIZE_T_MAX - incr) {
648 PyErr_SetString(PyExc_OverflowError,
649 "encoded result is too long for a Python string");
650 return NULL;
651 }
652 size += incr;
653 }
654
Victor Stinnerad771582015-10-09 12:38:53 +0200655 str = _PyBytesWriter_Prepare(writer, str, size);
656 if (str == NULL)
657 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200658
659 /* generate replacement */
660 for (i = collstart; i < collend; ++i) {
661 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200662 *str++ = '\\';
663 if (ch >= 0x00010000) {
664 *str++ = 'U';
665 *str++ = Py_hexdigits[(ch>>28)&0xf];
666 *str++ = Py_hexdigits[(ch>>24)&0xf];
667 *str++ = Py_hexdigits[(ch>>20)&0xf];
668 *str++ = Py_hexdigits[(ch>>16)&0xf];
669 *str++ = Py_hexdigits[(ch>>12)&0xf];
670 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200671 }
Victor Stinner797485e2015-10-09 03:17:30 +0200672 else if (ch >= 0x100) {
673 *str++ = 'u';
674 *str++ = Py_hexdigits[(ch>>12)&0xf];
675 *str++ = Py_hexdigits[(ch>>8)&0xf];
676 }
677 else
678 *str++ = 'x';
679 *str++ = Py_hexdigits[(ch>>4)&0xf];
680 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200681 }
682 return str;
683}
684
685/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
686 ASCII, Latin1, UTF-8, etc. */
687static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200688xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200689 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
690{
Victor Stinnerad771582015-10-09 12:38:53 +0200691 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200692 Py_UCS4 ch;
693 enum PyUnicode_Kind kind;
694 void *data;
695
696 assert(PyUnicode_IS_READY(unicode));
697 kind = PyUnicode_KIND(unicode);
698 data = PyUnicode_DATA(unicode);
699
700 size = 0;
701 /* determine replacement size */
702 for (i = collstart; i < collend; ++i) {
703 Py_ssize_t incr;
704
705 ch = PyUnicode_READ(kind, data, i);
706 if (ch < 10)
707 incr = 2+1+1;
708 else if (ch < 100)
709 incr = 2+2+1;
710 else if (ch < 1000)
711 incr = 2+3+1;
712 else if (ch < 10000)
713 incr = 2+4+1;
714 else if (ch < 100000)
715 incr = 2+5+1;
716 else if (ch < 1000000)
717 incr = 2+6+1;
718 else {
719 assert(ch <= MAX_UNICODE);
720 incr = 2+7+1;
721 }
722 if (size > PY_SSIZE_T_MAX - incr) {
723 PyErr_SetString(PyExc_OverflowError,
724 "encoded result is too long for a Python string");
725 return NULL;
726 }
727 size += incr;
728 }
729
Victor Stinnerad771582015-10-09 12:38:53 +0200730 str = _PyBytesWriter_Prepare(writer, str, size);
731 if (str == NULL)
732 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200733
734 /* generate replacement */
735 for (i = collstart; i < collend; ++i) {
736 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
737 }
738 return str;
739}
740
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741/* --- Bloom Filters ----------------------------------------------------- */
742
743/* stuff to implement simple "bloom filters" for Unicode characters.
744 to keep things simple, we use a single bitmask, using the least 5
745 bits from each unicode characters as the bit index. */
746
747/* the linebreak mask is set up by Unicode_Init below */
748
Antoine Pitrouf068f942010-01-13 14:19:12 +0000749#if LONG_BIT >= 128
750#define BLOOM_WIDTH 128
751#elif LONG_BIT >= 64
752#define BLOOM_WIDTH 64
753#elif LONG_BIT >= 32
754#define BLOOM_WIDTH 32
755#else
756#error "LONG_BIT is smaller than 32"
757#endif
758
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759#define BLOOM_MASK unsigned long
760
Serhiy Storchaka05997252013-01-26 12:14:02 +0200761static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000762
Antoine Pitrouf068f942010-01-13 14:19:12 +0000763#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000764
Benjamin Peterson29060642009-01-31 22:14:21 +0000765#define BLOOM_LINEBREAK(ch) \
766 ((ch) < 128U ? ascii_linebreak[(ch)] : \
767 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000768
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700769static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200770make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000771{
Victor Stinnera85af502013-04-09 21:53:54 +0200772#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
773 do { \
774 TYPE *data = (TYPE *)PTR; \
775 TYPE *end = data + LEN; \
776 Py_UCS4 ch; \
777 for (; data != end; data++) { \
778 ch = *data; \
779 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
780 } \
781 break; \
782 } while (0)
783
Thomas Wouters477c8d52006-05-27 19:21:47 +0000784 /* calculate simple bloom-style bitmask for a given unicode string */
785
Antoine Pitrouf068f942010-01-13 14:19:12 +0000786 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000787
788 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200789 switch (kind) {
790 case PyUnicode_1BYTE_KIND:
791 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
792 break;
793 case PyUnicode_2BYTE_KIND:
794 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
795 break;
796 case PyUnicode_4BYTE_KIND:
797 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
798 break;
799 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700800 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200801 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000802 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200803
804#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000805}
806
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300807static int
808ensure_unicode(PyObject *obj)
809{
810 if (!PyUnicode_Check(obj)) {
811 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200812 "must be str, not %.100s",
813 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300814 return -1;
815 }
816 return PyUnicode_READY(obj);
817}
818
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200819/* Compilation of templated routines */
820
821#include "stringlib/asciilib.h"
822#include "stringlib/fastsearch.h"
823#include "stringlib/partition.h"
824#include "stringlib/split.h"
825#include "stringlib/count.h"
826#include "stringlib/find.h"
827#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200828#include "stringlib/undef.h"
829
830#include "stringlib/ucs1lib.h"
831#include "stringlib/fastsearch.h"
832#include "stringlib/partition.h"
833#include "stringlib/split.h"
834#include "stringlib/count.h"
835#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300836#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200837#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200838#include "stringlib/undef.h"
839
840#include "stringlib/ucs2lib.h"
841#include "stringlib/fastsearch.h"
842#include "stringlib/partition.h"
843#include "stringlib/split.h"
844#include "stringlib/count.h"
845#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300846#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200847#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200848#include "stringlib/undef.h"
849
850#include "stringlib/ucs4lib.h"
851#include "stringlib/fastsearch.h"
852#include "stringlib/partition.h"
853#include "stringlib/split.h"
854#include "stringlib/count.h"
855#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300856#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200857#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200858#include "stringlib/undef.h"
859
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200860#include "stringlib/unicodedefs.h"
861#include "stringlib/fastsearch.h"
862#include "stringlib/count.h"
863#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100864#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200865
Guido van Rossumd57fd912000-03-10 22:53:23 +0000866/* --- Unicode Object ----------------------------------------------------- */
867
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700868static inline Py_ssize_t
869findchar(const void *s, int kind,
870 Py_ssize_t size, Py_UCS4 ch,
871 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200872{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200873 switch (kind) {
874 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200875 if ((Py_UCS1) ch != ch)
876 return -1;
877 if (direction > 0)
878 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
879 else
880 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200881 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200882 if ((Py_UCS2) ch != ch)
883 return -1;
884 if (direction > 0)
885 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
886 else
887 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200888 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200889 if (direction > 0)
890 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
891 else
892 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200893 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700894 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200896}
897
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to the current length are
   overwritten (every byte is set to 0xff); nothing happens when the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
916
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917static PyObject*
918resize_compact(PyObject *unicode, Py_ssize_t length)
919{
920 Py_ssize_t char_size;
921 Py_ssize_t struct_size;
922 Py_ssize_t new_size;
923 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100924 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200925#ifdef Py_DEBUG
926 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
927#endif
928
Victor Stinner79891572012-05-03 13:43:07 +0200929 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200930 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100931 assert(PyUnicode_IS_COMPACT(unicode));
932
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200933 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100934 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935 struct_size = sizeof(PyASCIIObject);
936 else
937 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200938 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
941 PyErr_NoMemory();
942 return NULL;
943 }
944 new_size = (struct_size + (length + 1) * char_size);
945
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200946 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
947 PyObject_DEL(_PyUnicode_UTF8(unicode));
948 _PyUnicode_UTF8(unicode) = NULL;
949 _PyUnicode_UTF8_LENGTH(unicode) = 0;
950 }
Victor Stinner84def372011-12-11 20:04:56 +0100951 _Py_DEC_REFTOTAL;
952 _Py_ForgetReference(unicode);
953
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300954 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100955 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100956 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 PyErr_NoMemory();
958 return NULL;
959 }
Victor Stinner84def372011-12-11 20:04:56 +0100960 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100962
Victor Stinnerfe226c02011-10-03 03:52:20 +0200963 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200964 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200965 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100966 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200967 _PyUnicode_WSTR_LENGTH(unicode) = length;
968 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100969 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
970 PyObject_DEL(_PyUnicode_WSTR(unicode));
971 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100972 if (!PyUnicode_IS_ASCII(unicode))
973 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100974 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200975#ifdef Py_DEBUG
976 unicode_fill_invalid(unicode, old_length);
977#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200978 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
979 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200980 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200981 return unicode;
982}
983
Alexander Belopolsky40018472011-02-26 01:02:56 +0000984static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200985resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000986{
Victor Stinner95663112011-10-04 01:03:50 +0200987 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100988 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200989 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200990 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000991
Victor Stinnerfe226c02011-10-03 03:52:20 +0200992 if (PyUnicode_IS_READY(unicode)) {
993 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200994 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200995 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200996#ifdef Py_DEBUG
997 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
998#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999
1000 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001001 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1003 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001004
1005 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1006 PyErr_NoMemory();
1007 return -1;
1008 }
1009 new_size = (length + 1) * char_size;
1010
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1012 {
1013 PyObject_DEL(_PyUnicode_UTF8(unicode));
1014 _PyUnicode_UTF8(unicode) = NULL;
1015 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1016 }
1017
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 data = (PyObject *)PyObject_REALLOC(data, new_size);
1019 if (data == NULL) {
1020 PyErr_NoMemory();
1021 return -1;
1022 }
1023 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001024 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001026 _PyUnicode_WSTR_LENGTH(unicode) = length;
1027 }
1028 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001029 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001030 _PyUnicode_UTF8_LENGTH(unicode) = length;
1031 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001032 _PyUnicode_LENGTH(unicode) = length;
1033 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001034#ifdef Py_DEBUG
1035 unicode_fill_invalid(unicode, old_length);
1036#endif
Victor Stinner95663112011-10-04 01:03:50 +02001037 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001038 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001039 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 }
Victor Stinner95663112011-10-04 01:03:50 +02001042 assert(_PyUnicode_WSTR(unicode) != NULL);
1043
1044 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001045 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001046 PyErr_NoMemory();
1047 return -1;
1048 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001049 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001050 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001051 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001052 if (!wstr) {
1053 PyErr_NoMemory();
1054 return -1;
1055 }
1056 _PyUnicode_WSTR(unicode) = wstr;
1057 _PyUnicode_WSTR(unicode)[length] = 0;
1058 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001059 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 return 0;
1061}
1062
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063static PyObject*
1064resize_copy(PyObject *unicode, Py_ssize_t length)
1065{
1066 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001067 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001068 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001069
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001070 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001071
1072 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1073 if (copy == NULL)
1074 return NULL;
1075
1076 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001077 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001079 }
1080 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001081 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001082
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001083 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001084 if (w == NULL)
1085 return NULL;
1086 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1087 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001088 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001089 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001090 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001091 }
1092}
1093
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001095 Ux0000 terminated; some code (e.g. new_identifier)
1096 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097
1098 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001099 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100
1101*/
1102
Alexander Belopolsky40018472011-02-26 01:02:56 +00001103static PyUnicodeObject *
1104_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001106 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108
Thomas Wouters477c8d52006-05-27 19:21:47 +00001109 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 if (length == 0 && unicode_empty != NULL) {
1111 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001112 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 }
1114
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001115 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001116 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001117 return (PyUnicodeObject *)PyErr_NoMemory();
1118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 if (length < 0) {
1120 PyErr_SetString(PyExc_SystemError,
1121 "Negative size passed to _PyUnicode_New");
1122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 }
1124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1126 if (unicode == NULL)
1127 return NULL;
1128 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001129
1130 _PyUnicode_WSTR_LENGTH(unicode) = length;
1131 _PyUnicode_HASH(unicode) = -1;
1132 _PyUnicode_STATE(unicode).interned = 0;
1133 _PyUnicode_STATE(unicode).kind = 0;
1134 _PyUnicode_STATE(unicode).compact = 0;
1135 _PyUnicode_STATE(unicode).ready = 0;
1136 _PyUnicode_STATE(unicode).ascii = 0;
1137 _PyUnicode_DATA_ANY(unicode) = NULL;
1138 _PyUnicode_LENGTH(unicode) = 0;
1139 _PyUnicode_UTF8(unicode) = NULL;
1140 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1143 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001144 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001145 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001146 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001147 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148
Jeremy Hyltond8082792003-09-16 19:41:39 +00001149 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001150 * the caller fails before initializing str -- unicode_resize()
1151 * reads str[0], and the Keep-Alive optimization can keep memory
1152 * allocated for str alive across a call to unicode_dealloc(unicode).
1153 * We don't want unicode_resize to read uninitialized memory in
1154 * that case.
1155 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156 _PyUnicode_WSTR(unicode)[0] = 0;
1157 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001158
Victor Stinner7931d9a2011-11-04 00:22:48 +01001159 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160 return unicode;
1161}
1162
Victor Stinnerf42dc442011-10-02 23:33:16 +02001163static const char*
1164unicode_kind_name(PyObject *unicode)
1165{
Victor Stinner42dfd712011-10-03 14:41:45 +02001166 /* don't check consistency: unicode_kind_name() is called from
1167 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001168 if (!PyUnicode_IS_COMPACT(unicode))
1169 {
1170 if (!PyUnicode_IS_READY(unicode))
1171 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001172 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001173 {
1174 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001175 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001176 return "legacy ascii";
1177 else
1178 return "legacy latin1";
1179 case PyUnicode_2BYTE_KIND:
1180 return "legacy UCS2";
1181 case PyUnicode_4BYTE_KIND:
1182 return "legacy UCS4";
1183 default:
1184 return "<legacy invalid kind>";
1185 }
1186 }
1187 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001188 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001189 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001190 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001191 return "ascii";
1192 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001193 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001194 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001195 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001196 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001197 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001198 default:
1199 return "<invalid compact kind>";
1200 }
1201}
1202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001203#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1209
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
Victor Stinnera42de742018-11-22 10:25:22 +01001214void *_PyUnicode_data(void *unicode_raw) {
1215 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216 printf("obj %p\n", unicode);
1217 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1218 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1219 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1220 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1221 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1222 return PyUnicode_DATA(unicode);
1223}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001224
1225void
1226_PyUnicode_Dump(PyObject *op)
1227{
1228 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001229 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1230 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1231 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001232
Victor Stinnera849a4b2011-10-03 12:12:11 +02001233 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001234 {
1235 if (ascii->state.ascii)
1236 data = (ascii + 1);
1237 else
1238 data = (compact + 1);
1239 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001240 else
1241 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001242 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1243 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001244
Victor Stinnera849a4b2011-10-03 12:12:11 +02001245 if (ascii->wstr == data)
1246 printf("shared ");
1247 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001248
Victor Stinnera3b334d2011-10-03 13:53:37 +02001249 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001250 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001251 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1252 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001253 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1254 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001255 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001256 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001257}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258#endif
1259
1260PyObject *
1261PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1262{
1263 PyObject *obj;
1264 PyCompactUnicodeObject *unicode;
1265 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001266 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001267 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268 Py_ssize_t char_size;
1269 Py_ssize_t struct_size;
1270
1271 /* Optimization for empty strings */
1272 if (size == 0 && unicode_empty != NULL) {
1273 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001274 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001275 }
1276
Victor Stinner9e9d6892011-10-04 01:02:02 +02001277 is_ascii = 0;
1278 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001279 struct_size = sizeof(PyCompactUnicodeObject);
1280 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001281 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001282 char_size = 1;
1283 is_ascii = 1;
1284 struct_size = sizeof(PyASCIIObject);
1285 }
1286 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001287 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001288 char_size = 1;
1289 }
1290 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001291 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001292 char_size = 2;
1293 if (sizeof(wchar_t) == 2)
1294 is_sharing = 1;
1295 }
1296 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001297 if (maxchar > MAX_UNICODE) {
1298 PyErr_SetString(PyExc_SystemError,
1299 "invalid maximum character passed to PyUnicode_New");
1300 return NULL;
1301 }
Victor Stinner8f825062012-04-27 13:55:39 +02001302 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001303 char_size = 4;
1304 if (sizeof(wchar_t) == 4)
1305 is_sharing = 1;
1306 }
1307
1308 /* Ensure we won't overflow the size. */
1309 if (size < 0) {
1310 PyErr_SetString(PyExc_SystemError,
1311 "Negative size passed to PyUnicode_New");
1312 return NULL;
1313 }
1314 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1315 return PyErr_NoMemory();
1316
1317 /* Duplicated allocation code from _PyObject_New() instead of a call to
1318 * PyObject_New() so we are able to allocate space for the object and
1319 * it's data buffer.
1320 */
1321 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1322 if (obj == NULL)
1323 return PyErr_NoMemory();
1324 obj = PyObject_INIT(obj, &PyUnicode_Type);
1325 if (obj == NULL)
1326 return NULL;
1327
1328 unicode = (PyCompactUnicodeObject *)obj;
1329 if (is_ascii)
1330 data = ((PyASCIIObject*)obj) + 1;
1331 else
1332 data = unicode + 1;
1333 _PyUnicode_LENGTH(unicode) = size;
1334 _PyUnicode_HASH(unicode) = -1;
1335 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001336 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 _PyUnicode_STATE(unicode).compact = 1;
1338 _PyUnicode_STATE(unicode).ready = 1;
1339 _PyUnicode_STATE(unicode).ascii = is_ascii;
1340 if (is_ascii) {
1341 ((char*)data)[size] = 0;
1342 _PyUnicode_WSTR(unicode) = NULL;
1343 }
Victor Stinner8f825062012-04-27 13:55:39 +02001344 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345 ((char*)data)[size] = 0;
1346 _PyUnicode_WSTR(unicode) = NULL;
1347 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001349 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351 else {
1352 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001353 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001354 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001355 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001356 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357 ((Py_UCS4*)data)[size] = 0;
1358 if (is_sharing) {
1359 _PyUnicode_WSTR_LENGTH(unicode) = size;
1360 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1361 }
1362 else {
1363 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1364 _PyUnicode_WSTR(unicode) = NULL;
1365 }
1366 }
Victor Stinner8f825062012-04-27 13:55:39 +02001367#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001368 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001369#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001370 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371 return obj;
1372}
1373
1374#if SIZEOF_WCHAR_T == 2
1375/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1376 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001377 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001378
1379 This function assumes that unicode can hold one more code point than wstr
1380 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001381static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001383 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384{
1385 const wchar_t *iter;
1386 Py_UCS4 *ucs4_out;
1387
Victor Stinner910337b2011-10-03 03:20:16 +02001388 assert(unicode != NULL);
1389 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1391 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1392
1393 for (iter = begin; iter < end; ) {
1394 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1395 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001396 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1397 && (iter+1) < end
1398 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 {
Victor Stinner551ac952011-11-29 22:58:13 +01001400 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401 iter += 2;
1402 }
1403 else {
1404 *ucs4_out++ = *iter;
1405 iter++;
1406 }
1407 }
1408 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1409 _PyUnicode_GET_LENGTH(unicode)));
1410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411}
1412#endif
1413
Victor Stinnercd9950f2011-10-02 00:34:53 +02001414static int
Victor Stinner488fa492011-12-12 00:01:39 +01001415unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001416{
Victor Stinner488fa492011-12-12 00:01:39 +01001417 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001418 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001419 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001420 return -1;
1421 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001422 return 0;
1423}
1424
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001425static int
1426_copy_characters(PyObject *to, Py_ssize_t to_start,
1427 PyObject *from, Py_ssize_t from_start,
1428 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001430 unsigned int from_kind, to_kind;
1431 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001432
Victor Stinneree4544c2012-05-09 22:24:08 +02001433 assert(0 <= how_many);
1434 assert(0 <= from_start);
1435 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001436 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001437 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001438 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439
Victor Stinnerd3f08822012-05-29 12:57:52 +02001440 assert(PyUnicode_Check(to));
1441 assert(PyUnicode_IS_READY(to));
1442 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1443
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001444 if (how_many == 0)
1445 return 0;
1446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001448 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001450 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451
Victor Stinnerf1852262012-06-16 16:38:26 +02001452#ifdef Py_DEBUG
1453 if (!check_maxchar
1454 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1455 {
1456 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1457 Py_UCS4 ch;
1458 Py_ssize_t i;
1459 for (i=0; i < how_many; i++) {
1460 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1461 assert(ch <= to_maxchar);
1462 }
1463 }
1464#endif
1465
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001466 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001467 if (check_maxchar
1468 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1469 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001470 /* Writing Latin-1 characters into an ASCII string requires to
1471 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001472 Py_UCS4 max_char;
1473 max_char = ucs1lib_find_max_char(from_data,
1474 (Py_UCS1*)from_data + how_many);
1475 if (max_char >= 128)
1476 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001477 }
Christian Heimesf051e432016-09-13 20:22:02 +02001478 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001479 (char*)from_data + from_kind * from_start,
1480 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001482 else if (from_kind == PyUnicode_1BYTE_KIND
1483 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001484 {
1485 _PyUnicode_CONVERT_BYTES(
1486 Py_UCS1, Py_UCS2,
1487 PyUnicode_1BYTE_DATA(from) + from_start,
1488 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1489 PyUnicode_2BYTE_DATA(to) + to_start
1490 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001491 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001492 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001493 && to_kind == PyUnicode_4BYTE_KIND)
1494 {
1495 _PyUnicode_CONVERT_BYTES(
1496 Py_UCS1, Py_UCS4,
1497 PyUnicode_1BYTE_DATA(from) + from_start,
1498 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1499 PyUnicode_4BYTE_DATA(to) + to_start
1500 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001501 }
1502 else if (from_kind == PyUnicode_2BYTE_KIND
1503 && to_kind == PyUnicode_4BYTE_KIND)
1504 {
1505 _PyUnicode_CONVERT_BYTES(
1506 Py_UCS2, Py_UCS4,
1507 PyUnicode_2BYTE_DATA(from) + from_start,
1508 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1509 PyUnicode_4BYTE_DATA(to) + to_start
1510 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001511 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001513 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1514
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001515 if (!check_maxchar) {
1516 if (from_kind == PyUnicode_2BYTE_KIND
1517 && to_kind == PyUnicode_1BYTE_KIND)
1518 {
1519 _PyUnicode_CONVERT_BYTES(
1520 Py_UCS2, Py_UCS1,
1521 PyUnicode_2BYTE_DATA(from) + from_start,
1522 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1523 PyUnicode_1BYTE_DATA(to) + to_start
1524 );
1525 }
1526 else if (from_kind == PyUnicode_4BYTE_KIND
1527 && to_kind == PyUnicode_1BYTE_KIND)
1528 {
1529 _PyUnicode_CONVERT_BYTES(
1530 Py_UCS4, Py_UCS1,
1531 PyUnicode_4BYTE_DATA(from) + from_start,
1532 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1533 PyUnicode_1BYTE_DATA(to) + to_start
1534 );
1535 }
1536 else if (from_kind == PyUnicode_4BYTE_KIND
1537 && to_kind == PyUnicode_2BYTE_KIND)
1538 {
1539 _PyUnicode_CONVERT_BYTES(
1540 Py_UCS4, Py_UCS2,
1541 PyUnicode_4BYTE_DATA(from) + from_start,
1542 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1543 PyUnicode_2BYTE_DATA(to) + to_start
1544 );
1545 }
1546 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001547 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001548 }
1549 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001550 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001551 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001552 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001553 Py_ssize_t i;
1554
Victor Stinnera0702ab2011-09-29 14:14:38 +02001555 for (i=0; i < how_many; i++) {
1556 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001557 if (ch > to_maxchar)
1558 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001559 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1560 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001561 }
1562 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 return 0;
1564}
1565
Victor Stinnerd3f08822012-05-29 12:57:52 +02001566void
1567_PyUnicode_FastCopyCharacters(
1568 PyObject *to, Py_ssize_t to_start,
1569 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001570{
1571 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1572}
1573
1574Py_ssize_t
1575PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1576 PyObject *from, Py_ssize_t from_start,
1577 Py_ssize_t how_many)
1578{
1579 int err;
1580
1581 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1582 PyErr_BadInternalCall();
1583 return -1;
1584 }
1585
Benjamin Petersonbac79492012-01-14 13:34:47 -05001586 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001587 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001588 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001589 return -1;
1590
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001591 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001592 PyErr_SetString(PyExc_IndexError, "string index out of range");
1593 return -1;
1594 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001595 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001596 PyErr_SetString(PyExc_IndexError, "string index out of range");
1597 return -1;
1598 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001599 if (how_many < 0) {
1600 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1601 return -1;
1602 }
1603 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001604 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1605 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001606 "Cannot write %zi characters at %zi "
1607 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001608 how_many, to_start, PyUnicode_GET_LENGTH(to));
1609 return -1;
1610 }
1611
1612 if (how_many == 0)
1613 return 0;
1614
Victor Stinner488fa492011-12-12 00:01:39 +01001615 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001616 return -1;
1617
1618 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1619 if (err) {
1620 PyErr_Format(PyExc_SystemError,
1621 "Cannot copy %s characters "
1622 "into a string of %s characters",
1623 unicode_kind_name(from),
1624 unicode_kind_name(to));
1625 return -1;
1626 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001627 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001628}
1629
Victor Stinner17222162011-09-28 22:15:37 +02001630/* Find the maximum code point and count the number of surrogate pairs so a
1631 correct string length can be computed before converting a string to UCS4.
1632 This function counts single surrogates as a character and not as a pair.
1633
1634 Return 0 on success, or -1 on error. */
1635static int
1636find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1637 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638{
1639 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001640 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001641
Victor Stinnerc53be962011-10-02 21:33:54 +02001642 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001643 *num_surrogates = 0;
1644 *maxchar = 0;
1645
1646 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001648 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1649 && (iter+1) < end
1650 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1651 {
1652 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1653 ++(*num_surrogates);
1654 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655 }
1656 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001658 {
1659 ch = *iter;
1660 iter++;
1661 }
1662 if (ch > *maxchar) {
1663 *maxchar = ch;
1664 if (*maxchar > MAX_UNICODE) {
1665 PyErr_Format(PyExc_ValueError,
1666 "character U+%x is not in range [U+0000; U+10ffff]",
1667 ch);
1668 return -1;
1669 }
1670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671 }
1672 return 0;
1673}
1674
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001675int
1676_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677{
1678 wchar_t *end;
1679 Py_UCS4 maxchar = 0;
1680 Py_ssize_t num_surrogates;
1681#if SIZEOF_WCHAR_T == 2
1682 Py_ssize_t length_wo_surrogates;
1683#endif
1684
Georg Brandl7597add2011-10-05 16:36:47 +02001685 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001686 strings were created using _PyObject_New() and where no canonical
1687 representation (the str field) has been set yet aka strings
1688 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001689 assert(_PyUnicode_CHECK(unicode));
1690 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001693 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001694 /* Actually, it should neither be interned nor be anything else: */
1695 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001698 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001699 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701
1702 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001703 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1704 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705 PyErr_NoMemory();
1706 return -1;
1707 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001708 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 _PyUnicode_WSTR(unicode), end,
1710 PyUnicode_1BYTE_DATA(unicode));
1711 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1712 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1713 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1714 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001715 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001716 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001717 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 }
1719 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001720 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001721 _PyUnicode_UTF8(unicode) = NULL;
1722 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 }
1724 PyObject_FREE(_PyUnicode_WSTR(unicode));
1725 _PyUnicode_WSTR(unicode) = NULL;
1726 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1727 }
1728 /* In this case we might have to convert down from 4-byte native
1729 wchar_t to 2-byte unicode. */
1730 else if (maxchar < 65536) {
1731 assert(num_surrogates == 0 &&
1732 "FindMaxCharAndNumSurrogatePairs() messed up");
1733
Victor Stinner506f5922011-09-28 22:34:18 +02001734#if SIZEOF_WCHAR_T == 2
1735 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001736 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001737 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1738 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1739 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001740 _PyUnicode_UTF8(unicode) = NULL;
1741 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001742#else
1743 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001744 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001745 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001746 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001747 PyErr_NoMemory();
1748 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 }
Victor Stinner506f5922011-09-28 22:34:18 +02001750 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1751 _PyUnicode_WSTR(unicode), end,
1752 PyUnicode_2BYTE_DATA(unicode));
1753 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1754 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1755 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001756 _PyUnicode_UTF8(unicode) = NULL;
1757 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001758 PyObject_FREE(_PyUnicode_WSTR(unicode));
1759 _PyUnicode_WSTR(unicode) = NULL;
1760 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1761#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 }
1763 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1764 else {
1765#if SIZEOF_WCHAR_T == 2
1766 /* in case the native representation is 2-bytes, we need to allocate a
1767 new normalized 4-byte version. */
1768 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001769 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1770 PyErr_NoMemory();
1771 return -1;
1772 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001773 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1774 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 PyErr_NoMemory();
1776 return -1;
1777 }
1778 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1779 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001780 _PyUnicode_UTF8(unicode) = NULL;
1781 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001782 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1783 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001784 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 PyObject_FREE(_PyUnicode_WSTR(unicode));
1786 _PyUnicode_WSTR(unicode) = NULL;
1787 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1788#else
1789 assert(num_surrogates == 0);
1790
Victor Stinnerc3c74152011-10-02 20:39:55 +02001791 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001793 _PyUnicode_UTF8(unicode) = NULL;
1794 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1796#endif
1797 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1798 }
1799 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001800 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 return 0;
1802}
1803
Alexander Belopolsky40018472011-02-26 01:02:56 +00001804static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001805unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806{
Walter Dörwald16807132007-05-25 13:52:07 +00001807 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001808 case SSTATE_NOT_INTERNED:
1809 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001810
Benjamin Peterson29060642009-01-31 22:14:21 +00001811 case SSTATE_INTERNED_MORTAL:
1812 /* revive dead object temporarily for DelItem */
1813 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001814 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001815 Py_FatalError(
1816 "deletion of interned string failed");
1817 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001818
Benjamin Peterson29060642009-01-31 22:14:21 +00001819 case SSTATE_INTERNED_IMMORTAL:
1820 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001821 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001822
Benjamin Peterson29060642009-01-31 22:14:21 +00001823 default:
1824 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001825 }
1826
Victor Stinner03490912011-10-03 23:45:12 +02001827 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001829 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001830 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001831 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1832 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001834 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835}
1836
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001837#ifdef Py_DEBUG
1838static int
1839unicode_is_singleton(PyObject *unicode)
1840{
1841 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1842 if (unicode == unicode_empty)
1843 return 1;
1844 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1845 {
1846 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1847 if (ch < 256 && unicode_latin1[ch] == unicode)
1848 return 1;
1849 }
1850 return 0;
1851}
1852#endif
1853
Alexander Belopolsky40018472011-02-26 01:02:56 +00001854static int
Victor Stinner488fa492011-12-12 00:01:39 +01001855unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001856{
Victor Stinner488fa492011-12-12 00:01:39 +01001857 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001858 if (Py_REFCNT(unicode) != 1)
1859 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001860 if (_PyUnicode_HASH(unicode) != -1)
1861 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 if (PyUnicode_CHECK_INTERNED(unicode))
1863 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001864 if (!PyUnicode_CheckExact(unicode))
1865 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001866#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001867 /* singleton refcount is greater than 1 */
1868 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001869#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001870 return 1;
1871}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001872
Victor Stinnerfe226c02011-10-03 03:52:20 +02001873static int
1874unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1875{
1876 PyObject *unicode;
1877 Py_ssize_t old_length;
1878
1879 assert(p_unicode != NULL);
1880 unicode = *p_unicode;
1881
1882 assert(unicode != NULL);
1883 assert(PyUnicode_Check(unicode));
1884 assert(0 <= length);
1885
Victor Stinner910337b2011-10-03 03:20:16 +02001886 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001887 old_length = PyUnicode_WSTR_LENGTH(unicode);
1888 else
1889 old_length = PyUnicode_GET_LENGTH(unicode);
1890 if (old_length == length)
1891 return 0;
1892
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001893 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001894 _Py_INCREF_UNICODE_EMPTY();
1895 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001896 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001897 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001898 return 0;
1899 }
1900
Victor Stinner488fa492011-12-12 00:01:39 +01001901 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001902 PyObject *copy = resize_copy(unicode, length);
1903 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001904 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001905 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001906 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001907 }
1908
Victor Stinnerfe226c02011-10-03 03:52:20 +02001909 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001910 PyObject *new_unicode = resize_compact(unicode, length);
1911 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001912 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001913 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001914 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001915 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001916 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001917}
1918
Alexander Belopolsky40018472011-02-26 01:02:56 +00001919int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001920PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001921{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001922 PyObject *unicode;
1923 if (p_unicode == NULL) {
1924 PyErr_BadInternalCall();
1925 return -1;
1926 }
1927 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001928 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001929 {
1930 PyErr_BadInternalCall();
1931 return -1;
1932 }
1933 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001934}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001935
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001936/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001937
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001938 WARNING: The function doesn't copy the terminating null character and
1939 doesn't check the maximum character (may write a latin1 character in an
1940 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001941static void
1942unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1943 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001944{
1945 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1946 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001947 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001948
1949 switch (kind) {
1950 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001951 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001952#ifdef Py_DEBUG
1953 if (PyUnicode_IS_ASCII(unicode)) {
1954 Py_UCS4 maxchar = ucs1lib_find_max_char(
1955 (const Py_UCS1*)str,
1956 (const Py_UCS1*)str + len);
1957 assert(maxchar < 128);
1958 }
1959#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001960 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001961 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001962 }
1963 case PyUnicode_2BYTE_KIND: {
1964 Py_UCS2 *start = (Py_UCS2 *)data + index;
1965 Py_UCS2 *ucs2 = start;
1966 assert(index <= PyUnicode_GET_LENGTH(unicode));
1967
Victor Stinner184252a2012-06-16 02:57:41 +02001968 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001969 *ucs2 = (Py_UCS2)*str;
1970
1971 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001972 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001973 }
1974 default: {
1975 Py_UCS4 *start = (Py_UCS4 *)data + index;
1976 Py_UCS4 *ucs4 = start;
1977 assert(kind == PyUnicode_4BYTE_KIND);
1978 assert(index <= PyUnicode_GET_LENGTH(unicode));
1979
Victor Stinner184252a2012-06-16 02:57:41 +02001980 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001981 *ucs4 = (Py_UCS4)*str;
1982
1983 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001984 }
1985 }
1986}
1987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988static PyObject*
1989get_latin1_char(unsigned char ch)
1990{
Victor Stinnera464fc12011-10-02 20:39:30 +02001991 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001993 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 if (!unicode)
1995 return NULL;
1996 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001997 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 unicode_latin1[ch] = unicode;
1999 }
2000 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002001 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002}
2003
Victor Stinner985a82a2014-01-03 12:53:47 +01002004static PyObject*
2005unicode_char(Py_UCS4 ch)
2006{
2007 PyObject *unicode;
2008
2009 assert(ch <= MAX_UNICODE);
2010
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002011 if (ch < 256)
2012 return get_latin1_char(ch);
2013
Victor Stinner985a82a2014-01-03 12:53:47 +01002014 unicode = PyUnicode_New(1, ch);
2015 if (unicode == NULL)
2016 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002017
2018 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2019 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002020 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002021 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002022 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2023 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2024 }
2025 assert(_PyUnicode_CheckConsistency(unicode, 1));
2026 return unicode;
2027}
2028
Alexander Belopolsky40018472011-02-26 01:02:56 +00002029PyObject *
2030PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002032 if (u == NULL)
2033 return (PyObject*)_PyUnicode_New(size);
2034
2035 if (size < 0) {
2036 PyErr_BadInternalCall();
2037 return NULL;
2038 }
2039
2040 return PyUnicode_FromWideChar(u, size);
2041}
2042
2043PyObject *
2044PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2045{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002046 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 Py_UCS4 maxchar = 0;
2048 Py_ssize_t num_surrogates;
2049
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002050 if (u == NULL && size != 0) {
2051 PyErr_BadInternalCall();
2052 return NULL;
2053 }
2054
2055 if (size == -1) {
2056 size = wcslen(u);
2057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002059 /* If the Unicode data is known at construction time, we can apply
2060 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002062 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002063 if (size == 0)
2064 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 /* Single character Unicode objects in the Latin-1 range are
2067 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002068 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002069 return get_latin1_char((unsigned char)*u);
2070
2071 /* If not empty and not single character, copy the Unicode data
2072 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002073 if (find_maxchar_surrogates(u, u + size,
2074 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002075 return NULL;
2076
Victor Stinner8faf8212011-12-08 22:14:11 +01002077 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 if (!unicode)
2079 return NULL;
2080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 switch (PyUnicode_KIND(unicode)) {
2082 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002083 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2085 break;
2086 case PyUnicode_2BYTE_KIND:
2087#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002088 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002090 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2092#endif
2093 break;
2094 case PyUnicode_4BYTE_KIND:
2095#if SIZEOF_WCHAR_T == 2
2096 /* This is the only case which has to process surrogates, thus
2097 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002098 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099#else
2100 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002101 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002102#endif
2103 break;
2104 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002105 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002108 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109}
2110
Alexander Belopolsky40018472011-02-26 01:02:56 +00002111PyObject *
2112PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002113{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002114 if (size < 0) {
2115 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002116 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002117 return NULL;
2118 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002119 if (u != NULL)
2120 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2121 else
2122 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002123}
2124
Alexander Belopolsky40018472011-02-26 01:02:56 +00002125PyObject *
2126PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002127{
2128 size_t size = strlen(u);
2129 if (size > PY_SSIZE_T_MAX) {
2130 PyErr_SetString(PyExc_OverflowError, "input too long");
2131 return NULL;
2132 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002133 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002134}
2135
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002136PyObject *
2137_PyUnicode_FromId(_Py_Identifier *id)
2138{
2139 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002140 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2141 strlen(id->string),
2142 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002143 if (!id->object)
2144 return NULL;
2145 PyUnicode_InternInPlace(&id->object);
2146 assert(!id->next);
2147 id->next = static_strings;
2148 static_strings = id;
2149 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002150 return id->object;
2151}
2152
2153void
2154_PyUnicode_ClearStaticStrings()
2155{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002156 _Py_Identifier *tmp, *s = static_strings;
2157 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002158 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002159 tmp = s->next;
2160 s->next = NULL;
2161 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002162 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002163 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002164}
2165
Benjamin Peterson0df54292012-03-26 14:50:32 -04002166/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002167
Victor Stinnerd3f08822012-05-29 12:57:52 +02002168PyObject*
2169_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002170{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002171 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002172 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002173 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002174#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002175 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002176#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002177 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002178 }
Victor Stinner785938e2011-12-11 20:09:03 +01002179 unicode = PyUnicode_New(size, 127);
2180 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002181 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002182 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2183 assert(_PyUnicode_CheckConsistency(unicode, 1));
2184 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002185}
2186
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002187static Py_UCS4
2188kind_maxchar_limit(unsigned int kind)
2189{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002190 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002191 case PyUnicode_1BYTE_KIND:
2192 return 0x80;
2193 case PyUnicode_2BYTE_KIND:
2194 return 0x100;
2195 case PyUnicode_4BYTE_KIND:
2196 return 0x10000;
2197 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002198 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002199 }
2200}
2201
Victor Stinner702c7342011-10-05 13:50:52 +02002202static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002203_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002204{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002206 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002207
Serhiy Storchaka678db842013-01-26 12:16:36 +02002208 if (size == 0)
2209 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002210 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002211 if (size == 1)
2212 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002213
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002214 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002215 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 if (!res)
2217 return NULL;
2218 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002219 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002221}
2222
Victor Stinnere57b1c02011-09-28 22:20:48 +02002223static PyObject*
2224_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225{
2226 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002227 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002228
Serhiy Storchaka678db842013-01-26 12:16:36 +02002229 if (size == 0)
2230 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002231 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002232 if (size == 1)
2233 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002234
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002235 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002236 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 if (!res)
2238 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002239 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002241 else {
2242 _PyUnicode_CONVERT_BYTES(
2243 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2244 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002245 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 return res;
2247}
2248
Victor Stinnere57b1c02011-09-28 22:20:48 +02002249static PyObject*
2250_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251{
2252 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002253 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002254
Serhiy Storchaka678db842013-01-26 12:16:36 +02002255 if (size == 0)
2256 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002257 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002258 if (size == 1)
2259 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002260
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002261 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002262 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 if (!res)
2264 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002265 if (max_char < 256)
2266 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2267 PyUnicode_1BYTE_DATA(res));
2268 else if (max_char < 0x10000)
2269 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2270 PyUnicode_2BYTE_DATA(res));
2271 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002273 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 return res;
2275}
2276
2277PyObject*
2278PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2279{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002280 if (size < 0) {
2281 PyErr_SetString(PyExc_ValueError, "size must be positive");
2282 return NULL;
2283 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002284 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002285 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002286 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002287 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002288 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002290 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002291 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002292 PyErr_SetString(PyExc_SystemError, "invalid kind");
2293 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002294 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295}
2296
Victor Stinnerece58de2012-04-23 23:36:38 +02002297Py_UCS4
2298_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2299{
2300 enum PyUnicode_Kind kind;
2301 void *startptr, *endptr;
2302
2303 assert(PyUnicode_IS_READY(unicode));
2304 assert(0 <= start);
2305 assert(end <= PyUnicode_GET_LENGTH(unicode));
2306 assert(start <= end);
2307
2308 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2309 return PyUnicode_MAX_CHAR_VALUE(unicode);
2310
2311 if (start == end)
2312 return 127;
2313
Victor Stinner94d558b2012-04-27 22:26:58 +02002314 if (PyUnicode_IS_ASCII(unicode))
2315 return 127;
2316
Victor Stinnerece58de2012-04-23 23:36:38 +02002317 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002318 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002319 endptr = (char *)startptr + end * kind;
2320 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002321 switch(kind) {
2322 case PyUnicode_1BYTE_KIND:
2323 return ucs1lib_find_max_char(startptr, endptr);
2324 case PyUnicode_2BYTE_KIND:
2325 return ucs2lib_find_max_char(startptr, endptr);
2326 case PyUnicode_4BYTE_KIND:
2327 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002328 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002329 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002330 }
2331}
2332
Victor Stinner25a4b292011-10-06 12:31:55 +02002333/* Ensure that a string uses the most efficient storage, if it is not the
2334 case: create a new string with of the right kind. Write NULL into *p_unicode
2335 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002336static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002337unicode_adjust_maxchar(PyObject **p_unicode)
2338{
2339 PyObject *unicode, *copy;
2340 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002341 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002342 unsigned int kind;
2343
2344 assert(p_unicode != NULL);
2345 unicode = *p_unicode;
2346 assert(PyUnicode_IS_READY(unicode));
2347 if (PyUnicode_IS_ASCII(unicode))
2348 return;
2349
2350 len = PyUnicode_GET_LENGTH(unicode);
2351 kind = PyUnicode_KIND(unicode);
2352 if (kind == PyUnicode_1BYTE_KIND) {
2353 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002354 max_char = ucs1lib_find_max_char(u, u + len);
2355 if (max_char >= 128)
2356 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002357 }
2358 else if (kind == PyUnicode_2BYTE_KIND) {
2359 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002360 max_char = ucs2lib_find_max_char(u, u + len);
2361 if (max_char >= 256)
2362 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002363 }
2364 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002365 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002366 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002367 max_char = ucs4lib_find_max_char(u, u + len);
2368 if (max_char >= 0x10000)
2369 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002370 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002371 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002372 if (copy != NULL)
2373 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002374 Py_DECREF(unicode);
2375 *p_unicode = copy;
2376}
2377
Victor Stinner034f6cf2011-09-30 02:26:44 +02002378PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002379_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002380{
Victor Stinner87af4f22011-11-21 23:03:47 +01002381 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002382 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383
Victor Stinner034f6cf2011-09-30 02:26:44 +02002384 if (!PyUnicode_Check(unicode)) {
2385 PyErr_BadInternalCall();
2386 return NULL;
2387 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002388 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002389 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002390
Victor Stinner87af4f22011-11-21 23:03:47 +01002391 length = PyUnicode_GET_LENGTH(unicode);
2392 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002393 if (!copy)
2394 return NULL;
2395 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2396
Christian Heimesf051e432016-09-13 20:22:02 +02002397 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002398 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002399 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002400 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002401}
2402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403
Victor Stinnerbc603d12011-10-02 01:00:40 +02002404/* Widen Unicode objects to larger buffers. Don't write terminating null
2405 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406
2407void*
2408_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2409{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002410 Py_ssize_t len;
2411 void *result;
2412 unsigned int skind;
2413
Benjamin Petersonbac79492012-01-14 13:34:47 -05002414 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002415 return NULL;
2416
2417 len = PyUnicode_GET_LENGTH(s);
2418 skind = PyUnicode_KIND(s);
2419 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002420 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 return NULL;
2422 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002423 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002424 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002425 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002426 if (!result)
2427 return PyErr_NoMemory();
2428 assert(skind == PyUnicode_1BYTE_KIND);
2429 _PyUnicode_CONVERT_BYTES(
2430 Py_UCS1, Py_UCS2,
2431 PyUnicode_1BYTE_DATA(s),
2432 PyUnicode_1BYTE_DATA(s) + len,
2433 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002435 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002436 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002437 if (!result)
2438 return PyErr_NoMemory();
2439 if (skind == PyUnicode_2BYTE_KIND) {
2440 _PyUnicode_CONVERT_BYTES(
2441 Py_UCS2, Py_UCS4,
2442 PyUnicode_2BYTE_DATA(s),
2443 PyUnicode_2BYTE_DATA(s) + len,
2444 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002446 else {
2447 assert(skind == PyUnicode_1BYTE_KIND);
2448 _PyUnicode_CONVERT_BYTES(
2449 Py_UCS1, Py_UCS4,
2450 PyUnicode_1BYTE_DATA(s),
2451 PyUnicode_1BYTE_DATA(s) + len,
2452 result);
2453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002455 default:
2456 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 }
Victor Stinner01698042011-10-04 00:04:26 +02002458 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 return NULL;
2460}
2461
2462static Py_UCS4*
2463as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2464 int copy_null)
2465{
2466 int kind;
2467 void *data;
2468 Py_ssize_t len, targetlen;
2469 if (PyUnicode_READY(string) == -1)
2470 return NULL;
2471 kind = PyUnicode_KIND(string);
2472 data = PyUnicode_DATA(string);
2473 len = PyUnicode_GET_LENGTH(string);
2474 targetlen = len;
2475 if (copy_null)
2476 targetlen++;
2477 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002478 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002479 if (!target) {
2480 PyErr_NoMemory();
2481 return NULL;
2482 }
2483 }
2484 else {
2485 if (targetsize < targetlen) {
2486 PyErr_Format(PyExc_SystemError,
2487 "string is longer than the buffer");
2488 if (copy_null && 0 < targetsize)
2489 target[0] = 0;
2490 return NULL;
2491 }
2492 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002493 if (kind == PyUnicode_1BYTE_KIND) {
2494 Py_UCS1 *start = (Py_UCS1 *) data;
2495 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002497 else if (kind == PyUnicode_2BYTE_KIND) {
2498 Py_UCS2 *start = (Py_UCS2 *) data;
2499 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2500 }
2501 else {
2502 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002503 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002504 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 if (copy_null)
2506 target[len] = 0;
2507 return target;
2508}
2509
2510Py_UCS4*
2511PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2512 int copy_null)
2513{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002514 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 PyErr_BadInternalCall();
2516 return NULL;
2517 }
2518 return as_ucs4(string, target, targetsize, copy_null);
2519}
2520
2521Py_UCS4*
2522PyUnicode_AsUCS4Copy(PyObject *string)
2523{
2524 return as_ucs4(string, NULL, 0, 1);
2525}
2526
/* maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   In the expression below, 1 + (SIZEOF_LONG_LONG*53-1) / 22 is the integer
   ceiling of SIZEOF_LONG_LONG*53/22, and the remaining 1 is for the sign. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002531
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002532static int
2533unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2534 Py_ssize_t width, Py_ssize_t precision)
2535{
2536 Py_ssize_t length, fill, arglen;
2537 Py_UCS4 maxchar;
2538
2539 if (PyUnicode_READY(str) == -1)
2540 return -1;
2541
2542 length = PyUnicode_GET_LENGTH(str);
2543 if ((precision == -1 || precision >= length)
2544 && width <= length)
2545 return _PyUnicodeWriter_WriteStr(writer, str);
2546
2547 if (precision != -1)
2548 length = Py_MIN(precision, length);
2549
2550 arglen = Py_MAX(length, width);
2551 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2552 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2553 else
2554 maxchar = writer->maxchar;
2555
2556 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2557 return -1;
2558
2559 if (width > length) {
2560 fill = width - length;
2561 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2562 return -1;
2563 writer->pos += fill;
2564 }
2565
2566 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2567 str, 0, length);
2568 writer->pos += length;
2569 return 0;
2570}
2571
2572static int
Victor Stinner998b8062018-09-12 00:23:25 +02002573unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002574 Py_ssize_t width, Py_ssize_t precision)
2575{
2576 /* UTF-8 */
2577 Py_ssize_t length;
2578 PyObject *unicode;
2579 int res;
2580
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002581 if (precision == -1) {
2582 length = strlen(str);
2583 }
2584 else {
2585 length = 0;
2586 while (length < precision && str[length]) {
2587 length++;
2588 }
2589 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2591 if (unicode == NULL)
2592 return -1;
2593
2594 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2595 Py_DECREF(unicode);
2596 return res;
2597}
2598
Victor Stinner96865452011-03-01 23:44:09 +00002599static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002600unicode_fromformat_arg(_PyUnicodeWriter *writer,
2601 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002602{
Victor Stinnere215d962012-10-06 23:03:36 +02002603 const char *p;
2604 Py_ssize_t len;
2605 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002606 Py_ssize_t width;
2607 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002608 int longflag;
2609 int longlongflag;
2610 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002611 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002612
2613 p = f;
2614 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002615 zeropad = 0;
2616 if (*f == '0') {
2617 zeropad = 1;
2618 f++;
2619 }
Victor Stinner96865452011-03-01 23:44:09 +00002620
2621 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002622 width = -1;
2623 if (Py_ISDIGIT((unsigned)*f)) {
2624 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002625 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002626 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002627 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002628 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002629 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002630 return NULL;
2631 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002632 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002633 f++;
2634 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002635 }
2636 precision = -1;
2637 if (*f == '.') {
2638 f++;
2639 if (Py_ISDIGIT((unsigned)*f)) {
2640 precision = (*f - '0');
2641 f++;
2642 while (Py_ISDIGIT((unsigned)*f)) {
2643 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2644 PyErr_SetString(PyExc_ValueError,
2645 "precision too big");
2646 return NULL;
2647 }
2648 precision = (precision * 10) + (*f - '0');
2649 f++;
2650 }
2651 }
Victor Stinner96865452011-03-01 23:44:09 +00002652 if (*f == '%') {
2653 /* "%.3%s" => f points to "3" */
2654 f--;
2655 }
2656 }
2657 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002658 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002659 f--;
2660 }
Victor Stinner96865452011-03-01 23:44:09 +00002661
2662 /* Handle %ld, %lu, %lld and %llu. */
2663 longflag = 0;
2664 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002665 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002666 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002667 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002668 longflag = 1;
2669 ++f;
2670 }
Victor Stinner96865452011-03-01 23:44:09 +00002671 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002672 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002673 longlongflag = 1;
2674 f += 2;
2675 }
Victor Stinner96865452011-03-01 23:44:09 +00002676 }
2677 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002678 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002679 size_tflag = 1;
2680 ++f;
2681 }
Victor Stinnere215d962012-10-06 23:03:36 +02002682
2683 if (f[1] == '\0')
2684 writer->overallocate = 0;
2685
2686 switch (*f) {
2687 case 'c':
2688 {
2689 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002690 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002691 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002692 "character argument not in range(0x110000)");
2693 return NULL;
2694 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002695 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002696 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002697 break;
2698 }
2699
2700 case 'i':
2701 case 'd':
2702 case 'u':
2703 case 'x':
2704 {
2705 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002706 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002707 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002708
2709 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002710 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002711 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002712 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002713 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002714 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002715 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002716 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002717 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002718 va_arg(*vargs, size_t));
2719 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002720 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002721 va_arg(*vargs, unsigned int));
2722 }
2723 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002724 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002725 }
2726 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002727 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002728 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002729 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002730 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002731 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002732 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002733 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002734 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002735 va_arg(*vargs, Py_ssize_t));
2736 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002737 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002738 va_arg(*vargs, int));
2739 }
2740 assert(len >= 0);
2741
Victor Stinnere215d962012-10-06 23:03:36 +02002742 if (precision < len)
2743 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002744
2745 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002746 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2747 return NULL;
2748
Victor Stinnere215d962012-10-06 23:03:36 +02002749 if (width > precision) {
2750 Py_UCS4 fillchar;
2751 fill = width - precision;
2752 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002753 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2754 return NULL;
2755 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002756 }
Victor Stinner15a11362012-10-06 23:48:20 +02002757 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002758 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002759 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2760 return NULL;
2761 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002762 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002763
Victor Stinner4a587072013-11-19 12:54:53 +01002764 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2765 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002766 break;
2767 }
2768
2769 case 'p':
2770 {
2771 char number[MAX_LONG_LONG_CHARS];
2772
2773 len = sprintf(number, "%p", va_arg(*vargs, void*));
2774 assert(len >= 0);
2775
2776 /* %p is ill-defined: ensure leading 0x. */
2777 if (number[1] == 'X')
2778 number[1] = 'x';
2779 else if (number[1] != 'x') {
2780 memmove(number + 2, number,
2781 strlen(number) + 1);
2782 number[0] = '0';
2783 number[1] = 'x';
2784 len += 2;
2785 }
2786
Victor Stinner4a587072013-11-19 12:54:53 +01002787 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002788 return NULL;
2789 break;
2790 }
2791
2792 case 's':
2793 {
2794 /* UTF-8 */
2795 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002796 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002797 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002798 break;
2799 }
2800
2801 case 'U':
2802 {
2803 PyObject *obj = va_arg(*vargs, PyObject *);
2804 assert(obj && _PyUnicode_CHECK(obj));
2805
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002806 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002807 return NULL;
2808 break;
2809 }
2810
2811 case 'V':
2812 {
2813 PyObject *obj = va_arg(*vargs, PyObject *);
2814 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002815 if (obj) {
2816 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002817 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002818 return NULL;
2819 }
2820 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002821 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002822 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002823 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002824 }
2825 break;
2826 }
2827
2828 case 'S':
2829 {
2830 PyObject *obj = va_arg(*vargs, PyObject *);
2831 PyObject *str;
2832 assert(obj);
2833 str = PyObject_Str(obj);
2834 if (!str)
2835 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002836 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002837 Py_DECREF(str);
2838 return NULL;
2839 }
2840 Py_DECREF(str);
2841 break;
2842 }
2843
2844 case 'R':
2845 {
2846 PyObject *obj = va_arg(*vargs, PyObject *);
2847 PyObject *repr;
2848 assert(obj);
2849 repr = PyObject_Repr(obj);
2850 if (!repr)
2851 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002852 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002853 Py_DECREF(repr);
2854 return NULL;
2855 }
2856 Py_DECREF(repr);
2857 break;
2858 }
2859
2860 case 'A':
2861 {
2862 PyObject *obj = va_arg(*vargs, PyObject *);
2863 PyObject *ascii;
2864 assert(obj);
2865 ascii = PyObject_ASCII(obj);
2866 if (!ascii)
2867 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002868 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002869 Py_DECREF(ascii);
2870 return NULL;
2871 }
2872 Py_DECREF(ascii);
2873 break;
2874 }
2875
2876 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002877 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002878 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002879 break;
2880
2881 default:
2882 /* if we stumble upon an unknown formatting code, copy the rest
2883 of the format string to the output string. (we cannot just
2884 skip the code, since there's no way to know what's in the
2885 argument list) */
2886 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002887 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002888 return NULL;
2889 f = p+len;
2890 return f;
2891 }
2892
2893 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002894 return f;
2895}
2896
Walter Dörwaldd2034312007-05-18 16:29:38 +00002897PyObject *
2898PyUnicode_FromFormatV(const char *format, va_list vargs)
2899{
Victor Stinnere215d962012-10-06 23:03:36 +02002900 va_list vargs2;
2901 const char *f;
2902 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002903
Victor Stinner8f674cc2013-04-17 23:02:17 +02002904 _PyUnicodeWriter_Init(&writer);
2905 writer.min_length = strlen(format) + 100;
2906 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002907
Benjamin Peterson0c212142016-09-20 20:39:33 -07002908 // Copy varags to be able to pass a reference to a subfunction.
2909 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002910
2911 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002912 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002913 f = unicode_fromformat_arg(&writer, f, &vargs2);
2914 if (f == NULL)
2915 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002916 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002917 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002918 const char *p;
2919 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002920
Victor Stinnere215d962012-10-06 23:03:36 +02002921 p = f;
2922 do
2923 {
2924 if ((unsigned char)*p > 127) {
2925 PyErr_Format(PyExc_ValueError,
2926 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2927 "string, got a non-ASCII byte: 0x%02x",
2928 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002929 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002930 }
2931 p++;
2932 }
2933 while (*p != '\0' && *p != '%');
2934 len = p - f;
2935
2936 if (*p == '\0')
2937 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002938
2939 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002940 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002941
2942 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002943 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002944 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002945 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002946 return _PyUnicodeWriter_Finish(&writer);
2947
2948 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002949 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002950 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002951 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002952}
2953
Walter Dörwaldd2034312007-05-18 16:29:38 +00002954PyObject *
2955PyUnicode_FromFormat(const char *format, ...)
2956{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002957 PyObject* ret;
2958 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002959
2960#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002961 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002962#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002963 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002964#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002965 ret = PyUnicode_FromFormatV(format, vargs);
2966 va_end(vargs);
2967 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002968}
2969
Serhiy Storchakac46db922018-10-23 22:58:24 +03002970static Py_ssize_t
2971unicode_get_widechar_size(PyObject *unicode)
2972{
2973 Py_ssize_t res;
2974
2975 assert(unicode != NULL);
2976 assert(_PyUnicode_CHECK(unicode));
2977
2978 if (_PyUnicode_WSTR(unicode) != NULL) {
2979 return PyUnicode_WSTR_LENGTH(unicode);
2980 }
2981 assert(PyUnicode_IS_READY(unicode));
2982
2983 res = _PyUnicode_LENGTH(unicode);
2984#if SIZEOF_WCHAR_T == 2
2985 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
2986 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2987 const Py_UCS4 *end = s + res;
2988 for (; s < end; ++s) {
2989 if (*s > 0xFFFF) {
2990 ++res;
2991 }
2992 }
2993 }
2994#endif
2995 return res;
2996}
2997
2998static void
2999unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3000{
3001 const wchar_t *wstr;
3002
3003 assert(unicode != NULL);
3004 assert(_PyUnicode_CHECK(unicode));
3005
3006 wstr = _PyUnicode_WSTR(unicode);
3007 if (wstr != NULL) {
3008 memcpy(w, wstr, size * sizeof(wchar_t));
3009 return;
3010 }
3011 assert(PyUnicode_IS_READY(unicode));
3012
3013 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3014 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3015 for (; size--; ++s, ++w) {
3016 *w = *s;
3017 }
3018 }
3019 else {
3020#if SIZEOF_WCHAR_T == 4
3021 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3022 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3023 for (; size--; ++s, ++w) {
3024 *w = *s;
3025 }
3026#else
3027 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3028 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3029 for (; size--; ++s, ++w) {
3030 Py_UCS4 ch = *s;
3031 if (ch > 0xFFFF) {
3032 assert(ch <= MAX_UNICODE);
3033 /* encode surrogate pair in this case */
3034 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3035 if (!size--)
3036 break;
3037 *w = Py_UNICODE_LOW_SURROGATE(ch);
3038 }
3039 else {
3040 *w = ch;
3041 }
3042 }
3043#endif
3044 }
3045}
3046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003047#ifdef HAVE_WCHAR_H
3048
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003049/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003050
Victor Stinnerd88d9832011-09-06 02:00:05 +02003051 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003052 character) required to convert the unicode object. Ignore size argument.
3053
Victor Stinnerd88d9832011-09-06 02:00:05 +02003054 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003055 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003056 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003057Py_ssize_t
3058PyUnicode_AsWideChar(PyObject *unicode,
3059 wchar_t *w,
3060 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003061{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003062 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003063
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003064 if (unicode == NULL) {
3065 PyErr_BadInternalCall();
3066 return -1;
3067 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003068 if (!PyUnicode_Check(unicode)) {
3069 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003070 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003071 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003072
3073 res = unicode_get_widechar_size(unicode);
3074 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003075 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003076 }
3077
3078 if (size > res) {
3079 size = res + 1;
3080 }
3081 else {
3082 res = size;
3083 }
3084 unicode_copy_as_widechar(unicode, w, size);
3085 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003086}
3087
Victor Stinner137c34c2010-09-29 10:25:54 +00003088wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003089PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003090 Py_ssize_t *size)
3091{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003092 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003093 Py_ssize_t buflen;
3094
3095 if (unicode == NULL) {
3096 PyErr_BadInternalCall();
3097 return NULL;
3098 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003099 if (!PyUnicode_Check(unicode)) {
3100 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003101 return NULL;
3102 }
3103
Serhiy Storchakac46db922018-10-23 22:58:24 +03003104 buflen = unicode_get_widechar_size(unicode);
3105 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003106 if (buffer == NULL) {
3107 PyErr_NoMemory();
3108 return NULL;
3109 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003110 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3111 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003112 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003113 }
3114 else if (wcslen(buffer) != (size_t)buflen) {
3115 PyMem_FREE(buffer);
3116 PyErr_SetString(PyExc_ValueError,
3117 "embedded null character");
3118 return NULL;
3119 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003120 return buffer;
3121}
3122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003123#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124
Alexander Belopolsky40018472011-02-26 01:02:56 +00003125PyObject *
3126PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003127{
Victor Stinner8faf8212011-12-08 22:14:11 +01003128 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003129 PyErr_SetString(PyExc_ValueError,
3130 "chr() arg not in range(0x110000)");
3131 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003132 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003133
Victor Stinner985a82a2014-01-03 12:53:47 +01003134 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003135}
3136
Alexander Belopolsky40018472011-02-26 01:02:56 +00003137PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003138PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003140 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003141 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003142 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003143 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003144 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 Py_INCREF(obj);
3146 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003147 }
3148 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003149 /* For a Unicode subtype that's not a Unicode object,
3150 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003151 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003152 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003153 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003154 "Can't convert '%.100s' object to str implicitly",
3155 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003156 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003157}
3158
Alexander Belopolsky40018472011-02-26 01:02:56 +00003159PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003160PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003161 const char *encoding,
3162 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003163{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003164 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003165 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003166
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003168 PyErr_BadInternalCall();
3169 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003171
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003172 /* Decoding bytes objects is the most common case and should be fast */
3173 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003174 if (PyBytes_GET_SIZE(obj) == 0)
3175 _Py_RETURN_UNICODE_EMPTY();
3176 v = PyUnicode_Decode(
3177 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3178 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003179 return v;
3180 }
3181
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003182 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003183 PyErr_SetString(PyExc_TypeError,
3184 "decoding str is not supported");
3185 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003186 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003187
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003188 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3189 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3190 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003191 "decoding to str: need a bytes-like object, %.80s found",
3192 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003193 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003194 }
Tim Petersced69f82003-09-16 20:30:58 +00003195
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003196 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003197 PyBuffer_Release(&buffer);
3198 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003200
Serhiy Storchaka05997252013-01-26 12:14:02 +02003201 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003202 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003203 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204}
3205
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters between two copied characters is collapsed into a single
   '_'; such runs at the start or at the end of the name are dropped.

   encoding:  null-terminated input name (must not be NULL).
   lower:     output buffer, null-terminated on success.
   lower_len: size of the output buffer in bytes, terminator included.

   Return 1 on success, or 0 on error (the normalized name is longer than
   lower_len-1, which includes the case lower_len == 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    /* With an empty buffer there is no room even for the terminator, and
       "lower_len - 1" would wrap around and disable every bounds check
       below. */
    if (lower_len == 0) {
        return 0;
    }

    char *out = lower;
    /* Last writable slot: reserved for the terminating null byte. */
    char *out_end = &lower[lower_len - 1];
    /* Set when separator characters were skipped since the last copy. */
    int punct = 0;

    for (const char *e = encoding; *e != '\0'; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            punct = 1;
            continue;
        }

        /* Emit one '_' for the skipped separators, but never as the first
           output character. */
        if (punct && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        punct = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3254
Alexander Belopolsky40018472011-02-26 01:02:56 +00003255PyObject *
3256PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003257 Py_ssize_t size,
3258 const char *encoding,
3259 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003260{
3261 PyObject *buffer = NULL, *unicode;
3262 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003263 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3264
3265 if (encoding == NULL) {
3266 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3267 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003268
Fred Drakee4315f52000-05-09 19:53:39 +00003269 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003270 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3271 char *lower = buflower;
3272
3273 /* Fast paths */
3274 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3275 lower += 3;
3276 if (*lower == '_') {
3277 /* Match "utf8" and "utf_8" */
3278 lower++;
3279 }
3280
3281 if (lower[0] == '8' && lower[1] == 0) {
3282 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3283 }
3284 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3285 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3286 }
3287 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3288 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3289 }
3290 }
3291 else {
3292 if (strcmp(lower, "ascii") == 0
3293 || strcmp(lower, "us_ascii") == 0) {
3294 return PyUnicode_DecodeASCII(s, size, errors);
3295 }
Steve Dowercc16be82016-09-08 10:35:16 -07003296 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003297 else if (strcmp(lower, "mbcs") == 0) {
3298 return PyUnicode_DecodeMBCS(s, size, errors);
3299 }
3300 #endif
3301 else if (strcmp(lower, "latin1") == 0
3302 || strcmp(lower, "latin_1") == 0
3303 || strcmp(lower, "iso_8859_1") == 0
3304 || strcmp(lower, "iso8859_1") == 0) {
3305 return PyUnicode_DecodeLatin1(s, size, errors);
3306 }
3307 }
Victor Stinner37296e82010-06-10 13:36:23 +00003308 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309
3310 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003311 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003312 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003313 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003314 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 if (buffer == NULL)
3316 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003317 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 if (unicode == NULL)
3319 goto onError;
3320 if (!PyUnicode_Check(unicode)) {
3321 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003322 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003323 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003324 encoding,
3325 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326 Py_DECREF(unicode);
3327 goto onError;
3328 }
3329 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003330 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003331
Benjamin Peterson29060642009-01-31 22:14:21 +00003332 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333 Py_XDECREF(buffer);
3334 return NULL;
3335}
3336
Alexander Belopolsky40018472011-02-26 01:02:56 +00003337PyObject *
3338PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003339 const char *encoding,
3340 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003341{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003342 if (!PyUnicode_Check(unicode)) {
3343 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003344 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003345 }
3346
Serhiy Storchaka00939072016-10-27 21:05:49 +03003347 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3348 "PyUnicode_AsDecodedObject() is deprecated; "
3349 "use PyCodec_Decode() to decode from str", 1) < 0)
3350 return NULL;
3351
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003352 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003353 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003354
3355 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003356 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003357}
3358
Alexander Belopolsky40018472011-02-26 01:02:56 +00003359PyObject *
3360PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003361 const char *encoding,
3362 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003363{
3364 PyObject *v;
3365
3366 if (!PyUnicode_Check(unicode)) {
3367 PyErr_BadArgument();
3368 goto onError;
3369 }
3370
Serhiy Storchaka00939072016-10-27 21:05:49 +03003371 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3372 "PyUnicode_AsDecodedUnicode() is deprecated; "
3373 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3374 return NULL;
3375
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003376 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003377 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003378
3379 /* Decode via the codec registry */
3380 v = PyCodec_Decode(unicode, encoding, errors);
3381 if (v == NULL)
3382 goto onError;
3383 if (!PyUnicode_Check(v)) {
3384 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003385 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003386 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003387 encoding,
3388 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003389 Py_DECREF(v);
3390 goto onError;
3391 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003392 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003393
Benjamin Peterson29060642009-01-31 22:14:21 +00003394 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003395 return NULL;
3396}
3397
Alexander Belopolsky40018472011-02-26 01:02:56 +00003398PyObject *
3399PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003400 Py_ssize_t size,
3401 const char *encoding,
3402 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403{
3404 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003405
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003406 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003408 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003409 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3410 Py_DECREF(unicode);
3411 return v;
3412}
3413
Alexander Belopolsky40018472011-02-26 01:02:56 +00003414PyObject *
3415PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003416 const char *encoding,
3417 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003418{
3419 PyObject *v;
3420
3421 if (!PyUnicode_Check(unicode)) {
3422 PyErr_BadArgument();
3423 goto onError;
3424 }
3425
Serhiy Storchaka00939072016-10-27 21:05:49 +03003426 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3427 "PyUnicode_AsEncodedObject() is deprecated; "
3428 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3429 "or PyCodec_Encode() for generic encoding", 1) < 0)
3430 return NULL;
3431
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003432 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003433 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003434
3435 /* Encode via the codec registry */
3436 v = PyCodec_Encode(unicode, encoding, errors);
3437 if (v == NULL)
3438 goto onError;
3439 return v;
3440
Benjamin Peterson29060642009-01-31 22:14:21 +00003441 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003442 return NULL;
3443}
3444
Victor Stinner1b579672011-12-17 05:47:23 +01003445
Victor Stinner2cba6b82018-01-10 22:46:15 +01003446static PyObject *
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003447unicode_encode_locale(PyObject *unicode, const char *errors,
3448 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003449{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003450 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003451
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003452 Py_ssize_t wlen;
3453 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3454 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003455 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003456 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003457
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003458 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003459 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003460 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003461 return NULL;
3462 }
3463
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003464 char *str;
3465 size_t error_pos;
3466 const char *reason;
3467 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003468 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003469 PyMem_Free(wstr);
3470
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003471 if (res != 0) {
3472 if (res == -2) {
3473 PyObject *exc;
3474 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3475 "locale", unicode,
3476 (Py_ssize_t)error_pos,
3477 (Py_ssize_t)(error_pos+1),
3478 reason);
3479 if (exc != NULL) {
3480 PyCodec_StrictErrors(exc);
3481 Py_DECREF(exc);
3482 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003483 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003484 else if (res == -3) {
3485 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3486 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003487 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003488 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003489 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003490 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003491 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003492
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003493 PyObject *bytes = PyBytes_FromString(str);
3494 PyMem_RawFree(str);
3495 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003496}
3497
Victor Stinnerad158722010-10-27 00:25:46 +00003498PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003499PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3500{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003501 return unicode_encode_locale(unicode, errors, 1);
3502}
3503
3504PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003505PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003506{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003507 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003508 const _PyCoreConfig *config = &interp->core_config;
3509#if defined(__APPLE__)
3510 return _PyUnicode_AsUTF8String(unicode, config->filesystem_errors);
3511#else
Victor Stinner793b5312011-04-27 00:24:21 +02003512 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3513 cannot use it to encode and decode filenames before it is loaded. Load
3514 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003515 implementation of the locale codec until the codec registry is
3516 initialized and the Python codec is loaded. See initfsencoding(). */
3517 if (interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003518 return PyUnicode_AsEncodedString(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003519 config->filesystem_encoding,
3520 config->filesystem_errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003521 }
3522 else {
Victor Stinner2cba6b82018-01-10 22:46:15 +01003523 return unicode_encode_locale(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003524 config->filesystem_errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003525 }
Victor Stinnerad158722010-10-27 00:25:46 +00003526#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003527}
3528
Alexander Belopolsky40018472011-02-26 01:02:56 +00003529PyObject *
3530PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003531 const char *encoding,
3532 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533{
3534 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003535 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003536
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 if (!PyUnicode_Check(unicode)) {
3538 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 }
Fred Drakee4315f52000-05-09 19:53:39 +00003541
Victor Stinner942889a2016-09-05 15:40:10 -07003542 if (encoding == NULL) {
3543 return _PyUnicode_AsUTF8String(unicode, errors);
3544 }
3545
Fred Drakee4315f52000-05-09 19:53:39 +00003546 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003547 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3548 char *lower = buflower;
3549
3550 /* Fast paths */
3551 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3552 lower += 3;
3553 if (*lower == '_') {
3554 /* Match "utf8" and "utf_8" */
3555 lower++;
3556 }
3557
3558 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003559 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003560 }
3561 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3562 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3563 }
3564 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3565 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3566 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003567 }
Victor Stinner942889a2016-09-05 15:40:10 -07003568 else {
3569 if (strcmp(lower, "ascii") == 0
3570 || strcmp(lower, "us_ascii") == 0) {
3571 return _PyUnicode_AsASCIIString(unicode, errors);
3572 }
Steve Dowercc16be82016-09-08 10:35:16 -07003573#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003574 else if (strcmp(lower, "mbcs") == 0) {
3575 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3576 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003577#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003578 else if (strcmp(lower, "latin1") == 0 ||
3579 strcmp(lower, "latin_1") == 0 ||
3580 strcmp(lower, "iso_8859_1") == 0 ||
3581 strcmp(lower, "iso8859_1") == 0) {
3582 return _PyUnicode_AsLatin1String(unicode, errors);
3583 }
3584 }
Victor Stinner37296e82010-06-10 13:36:23 +00003585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586
3587 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003588 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003590 return NULL;
3591
3592 /* The normal path */
3593 if (PyBytes_Check(v))
3594 return v;
3595
3596 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003597 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003598 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003599 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003600
3601 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003602 "encoder %s returned bytearray instead of bytes; "
3603 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003604 encoding);
3605 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003606 Py_DECREF(v);
3607 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003608 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003609
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003610 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3611 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003612 Py_DECREF(v);
3613 return b;
3614 }
3615
3616 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003617 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003618 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003619 encoding,
3620 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003621 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003622 return NULL;
3623}
3624
Alexander Belopolsky40018472011-02-26 01:02:56 +00003625PyObject *
3626PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003627 const char *encoding,
3628 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003629{
3630 PyObject *v;
3631
3632 if (!PyUnicode_Check(unicode)) {
3633 PyErr_BadArgument();
3634 goto onError;
3635 }
3636
Serhiy Storchaka00939072016-10-27 21:05:49 +03003637 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3638 "PyUnicode_AsEncodedUnicode() is deprecated; "
3639 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3640 return NULL;
3641
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003642 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003643 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003644
3645 /* Encode via the codec registry */
3646 v = PyCodec_Encode(unicode, encoding, errors);
3647 if (v == NULL)
3648 goto onError;
3649 if (!PyUnicode_Check(v)) {
3650 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003651 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003652 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003653 encoding,
3654 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003655 Py_DECREF(v);
3656 goto onError;
3657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003659
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 return NULL;
3662}
3663
Victor Stinner2cba6b82018-01-10 22:46:15 +01003664static PyObject*
3665unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3666 int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003667{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003668 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003669
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003670 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3671 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003672 return NULL;
3673 }
3674
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003675 wchar_t *wstr;
3676 size_t wlen;
3677 const char *reason;
3678 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003679 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003680 if (res != 0) {
3681 if (res == -2) {
3682 PyObject *exc;
3683 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3684 "locale", str, len,
3685 (Py_ssize_t)wlen,
3686 (Py_ssize_t)(wlen + 1),
3687 reason);
3688 if (exc != NULL) {
3689 PyCodec_StrictErrors(exc);
3690 Py_DECREF(exc);
3691 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003692 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003693 else if (res == -3) {
3694 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3695 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003696 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003697 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003698 }
Victor Stinner2f197072011-12-17 07:08:30 +01003699 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003700 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003701
3702 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3703 PyMem_RawFree(wstr);
3704 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003705}
3706
3707PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003708PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3709 const char *errors)
3710{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003711 return unicode_decode_locale(str, len, errors, 1);
3712}
3713
3714PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003715PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003716{
3717 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003718 return unicode_decode_locale(str, size, errors, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003719}
3720
3721
3722PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003723PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003724 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003725 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3726}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003727
Christian Heimes5894ba72007-11-04 11:43:14 +00003728PyObject*
3729PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3730{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003731 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003732 const _PyCoreConfig *config = &interp->core_config;
3733#if defined(__APPLE__)
3734 return PyUnicode_DecodeUTF8Stateful(s, size, config->filesystem_errors, NULL);
3735#else
Victor Stinner793b5312011-04-27 00:24:21 +02003736 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3737 cannot use it to encode and decode filenames before it is loaded. Load
3738 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003739 implementation of the locale codec until the codec registry is
3740 initialized and the Python codec is loaded. See initfsencoding(). */
3741 if (interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003742 return PyUnicode_Decode(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003743 config->filesystem_encoding,
3744 config->filesystem_errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003745 }
3746 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003747 return unicode_decode_locale(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003748 config->filesystem_errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003749 }
Victor Stinnerad158722010-10-27 00:25:46 +00003750#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003751}
3752
Martin v. Löwis011e8422009-05-05 04:43:17 +00003753
3754int
3755PyUnicode_FSConverter(PyObject* arg, void* addr)
3756{
Brett Cannonec6ce872016-09-06 15:50:29 -07003757 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003758 PyObject *output = NULL;
3759 Py_ssize_t size;
3760 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003761 if (arg == NULL) {
3762 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003763 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003764 return 1;
3765 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003766 path = PyOS_FSPath(arg);
3767 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003768 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003769 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003770 if (PyBytes_Check(path)) {
3771 output = path;
3772 }
3773 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3774 output = PyUnicode_EncodeFSDefault(path);
3775 Py_DECREF(path);
3776 if (!output) {
3777 return 0;
3778 }
3779 assert(PyBytes_Check(output));
3780 }
3781
Victor Stinner0ea2a462010-04-30 00:22:08 +00003782 size = PyBytes_GET_SIZE(output);
3783 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003784 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003785 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003786 Py_DECREF(output);
3787 return 0;
3788 }
3789 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003790 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003791}
3792
3793
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003794int
3795PyUnicode_FSDecoder(PyObject* arg, void* addr)
3796{
Brett Cannona5711202016-09-06 19:36:01 -07003797 int is_buffer = 0;
3798 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003799 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003800 if (arg == NULL) {
3801 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003802 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003803 return 1;
3804 }
Brett Cannona5711202016-09-06 19:36:01 -07003805
3806 is_buffer = PyObject_CheckBuffer(arg);
3807 if (!is_buffer) {
3808 path = PyOS_FSPath(arg);
3809 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003810 return 0;
3811 }
Brett Cannona5711202016-09-06 19:36:01 -07003812 }
3813 else {
3814 path = arg;
3815 Py_INCREF(arg);
3816 }
3817
3818 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003819 output = path;
3820 }
3821 else if (PyBytes_Check(path) || is_buffer) {
3822 PyObject *path_bytes = NULL;
3823
3824 if (!PyBytes_Check(path) &&
3825 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003826 "path should be string, bytes, or os.PathLike, not %.200s",
3827 Py_TYPE(arg)->tp_name)) {
3828 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003829 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003830 }
3831 path_bytes = PyBytes_FromObject(path);
3832 Py_DECREF(path);
3833 if (!path_bytes) {
3834 return 0;
3835 }
3836 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3837 PyBytes_GET_SIZE(path_bytes));
3838 Py_DECREF(path_bytes);
3839 if (!output) {
3840 return 0;
3841 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003842 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003843 else {
3844 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003845 "path should be string, bytes, or os.PathLike, not %.200s",
3846 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003847 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003848 return 0;
3849 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003850 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003851 Py_DECREF(output);
3852 return 0;
3853 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003854 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003855 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003856 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003857 Py_DECREF(output);
3858 return 0;
3859 }
3860 *(PyObject**)addr = output;
3861 return Py_CLEANUP_SUPPORTED;
3862}
3863
3864
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003865const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003867{
Christian Heimesf3863112007-11-22 07:46:41 +00003868 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003870 if (!PyUnicode_Check(unicode)) {
3871 PyErr_BadArgument();
3872 return NULL;
3873 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003874 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003875 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003876
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003877 if (PyUnicode_UTF8(unicode) == NULL) {
3878 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003879 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003880 if (bytes == NULL)
3881 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003882 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3883 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003884 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885 Py_DECREF(bytes);
3886 return NULL;
3887 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003888 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003889 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003890 PyBytes_AS_STRING(bytes),
3891 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003892 Py_DECREF(bytes);
3893 }
3894
3895 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003896 *psize = PyUnicode_UTF8_LENGTH(unicode);
3897 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003898}
3899
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003900const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003901PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003902{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003903 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3904}
3905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003906Py_UNICODE *
3907PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3908{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003909 if (!PyUnicode_Check(unicode)) {
3910 PyErr_BadArgument();
3911 return NULL;
3912 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003913 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3914 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003916 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003917 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003918
Serhiy Storchakac46db922018-10-23 22:58:24 +03003919 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3920 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3921 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003924 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3925 if (w == NULL) {
3926 PyErr_NoMemory();
3927 return NULL;
3928 }
3929 unicode_copy_as_widechar(unicode, w, wlen + 1);
3930 _PyUnicode_WSTR(unicode) = w;
3931 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3932 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 }
3934 }
3935 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003936 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003937 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00003938}
3939
Alexander Belopolsky40018472011-02-26 01:02:56 +00003940Py_UNICODE *
3941PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944}
3945
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03003946const Py_UNICODE *
3947_PyUnicode_AsUnicode(PyObject *unicode)
3948{
3949 Py_ssize_t size;
3950 const Py_UNICODE *wstr;
3951
3952 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3953 if (wstr && wcslen(wstr) != (size_t)size) {
3954 PyErr_SetString(PyExc_ValueError, "embedded null character");
3955 return NULL;
3956 }
3957 return wstr;
3958}
3959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960
Alexander Belopolsky40018472011-02-26 01:02:56 +00003961Py_ssize_t
3962PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963{
3964 if (!PyUnicode_Check(unicode)) {
3965 PyErr_BadArgument();
3966 goto onError;
3967 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003968 if (_PyUnicode_WSTR(unicode) == NULL) {
3969 if (PyUnicode_AsUnicode(unicode) == NULL)
3970 goto onError;
3971 }
3972 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973
Benjamin Peterson29060642009-01-31 22:14:21 +00003974 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 return -1;
3976}
3977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978Py_ssize_t
3979PyUnicode_GetLength(PyObject *unicode)
3980{
Victor Stinner07621332012-06-16 04:53:46 +02003981 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003982 PyErr_BadArgument();
3983 return -1;
3984 }
Victor Stinner07621332012-06-16 04:53:46 +02003985 if (PyUnicode_READY(unicode) == -1)
3986 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 return PyUnicode_GET_LENGTH(unicode);
3988}
3989
3990Py_UCS4
3991PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3992{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003993 void *data;
3994 int kind;
3995
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003996 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003997 PyErr_BadArgument();
3998 return (Py_UCS4)-1;
3999 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004000 if (PyUnicode_READY(unicode) == -1) {
4001 return (Py_UCS4)-1;
4002 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004003 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004004 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005 return (Py_UCS4)-1;
4006 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004007 data = PyUnicode_DATA(unicode);
4008 kind = PyUnicode_KIND(unicode);
4009 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010}
4011
4012int
4013PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4014{
4015 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004016 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004017 return -1;
4018 }
Victor Stinner488fa492011-12-12 00:01:39 +01004019 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004020 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004021 PyErr_SetString(PyExc_IndexError, "string index out of range");
4022 return -1;
4023 }
Victor Stinner488fa492011-12-12 00:01:39 +01004024 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004025 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004026 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4027 PyErr_SetString(PyExc_ValueError, "character out of range");
4028 return -1;
4029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4031 index, ch);
4032 return 0;
4033}
4034
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage duration and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4040
Victor Stinner554f3f02010-06-16 23:33:54 +00004041/* create or adjust a UnicodeDecodeError */
4042static void
4043make_decode_exception(PyObject **exceptionObject,
4044 const char *encoding,
4045 const char *input, Py_ssize_t length,
4046 Py_ssize_t startpos, Py_ssize_t endpos,
4047 const char *reason)
4048{
4049 if (*exceptionObject == NULL) {
4050 *exceptionObject = PyUnicodeDecodeError_Create(
4051 encoding, input, length, startpos, endpos, reason);
4052 }
4053 else {
4054 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4055 goto onError;
4056 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4057 goto onError;
4058 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4059 goto onError;
4060 }
4061 return;
4062
4063onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004064 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004065}
4066
Steve Dowercc16be82016-09-08 10:35:16 -07004067#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004068static int
4069widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4070{
4071 if (newsize > *size) {
4072 wchar_t *newbuf = *buf;
4073 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4074 PyErr_NoMemory();
4075 return -1;
4076 }
4077 *buf = newbuf;
4078 }
4079 *size = newsize;
4080 return 0;
4081}
4082
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083/* error handling callback helper:
4084 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004085 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 and adjust various state variables.
4087 return 0 on success, -1 on error
4088*/
4089
Alexander Belopolsky40018472011-02-26 01:02:56 +00004090static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004091unicode_decode_call_errorhandler_wchar(
4092 const char *errors, PyObject **errorHandler,
4093 const char *encoding, const char *reason,
4094 const char **input, const char **inend, Py_ssize_t *startinpos,
4095 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004096 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004097{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004098 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004099
4100 PyObject *restuple = NULL;
4101 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004102 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004103 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004104 Py_ssize_t requiredsize;
4105 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004106 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004107 wchar_t *repwstr;
4108 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004109
4110 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 *errorHandler = PyCodec_LookupError(errors);
4112 if (*errorHandler == NULL)
4113 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114 }
4115
Victor Stinner554f3f02010-06-16 23:33:54 +00004116 make_decode_exception(exceptionObject,
4117 encoding,
4118 *input, *inend - *input,
4119 *startinpos, *endinpos,
4120 reason);
4121 if (*exceptionObject == NULL)
4122 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004124 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004126 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004128 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004131 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004133
4134 /* Copy back the bytes variables, which might have been modified by the
4135 callback */
4136 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4137 if (!inputobj)
4138 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004139 *input = PyBytes_AS_STRING(inputobj);
4140 insize = PyBytes_GET_SIZE(inputobj);
4141 *inend = *input + insize;
4142 /* we can DECREF safely, as the exception has another reference,
4143 so the object won't go away. */
4144 Py_DECREF(inputobj);
4145
4146 if (newpos<0)
4147 newpos = insize+newpos;
4148 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004149 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004150 goto onError;
4151 }
4152
4153 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4154 if (repwstr == NULL)
4155 goto onError;
4156 /* need more space? (at least enough for what we
4157 have+the replacement+the rest of the string (starting
4158 at the new input position), so we won't have to check space
4159 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004160 requiredsize = *outpos;
4161 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4162 goto overflow;
4163 requiredsize += repwlen;
4164 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4165 goto overflow;
4166 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004167 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004168 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004169 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004170 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004171 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004172 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004173 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004174 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004175 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004176 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004177 *endinpos = newpos;
4178 *inptr = *input + newpos;
4179
4180 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004181 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004182 return 0;
4183
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004184 overflow:
4185 PyErr_SetString(PyExc_OverflowError,
4186 "decoded result is too long for a Python string");
4187
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004188 onError:
4189 Py_XDECREF(restuple);
4190 return -1;
4191}
Steve Dowercc16be82016-09-08 10:35:16 -07004192#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004193
4194static int
4195unicode_decode_call_errorhandler_writer(
4196 const char *errors, PyObject **errorHandler,
4197 const char *encoding, const char *reason,
4198 const char **input, const char **inend, Py_ssize_t *startinpos,
4199 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4200 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4201{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004202 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004203
4204 PyObject *restuple = NULL;
4205 PyObject *repunicode = NULL;
4206 Py_ssize_t insize;
4207 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004208 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004209 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004210 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004211 int need_to_grow = 0;
4212 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004213
4214 if (*errorHandler == NULL) {
4215 *errorHandler = PyCodec_LookupError(errors);
4216 if (*errorHandler == NULL)
4217 goto onError;
4218 }
4219
4220 make_decode_exception(exceptionObject,
4221 encoding,
4222 *input, *inend - *input,
4223 *startinpos, *endinpos,
4224 reason);
4225 if (*exceptionObject == NULL)
4226 goto onError;
4227
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004228 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004229 if (restuple == NULL)
4230 goto onError;
4231 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004232 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004233 goto onError;
4234 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004235 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004236 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004237
4238 /* Copy back the bytes variables, which might have been modified by the
4239 callback */
4240 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4241 if (!inputobj)
4242 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004243 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004244 *input = PyBytes_AS_STRING(inputobj);
4245 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004246 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004247 /* we can DECREF safely, as the exception has another reference,
4248 so the object won't go away. */
4249 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004250
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004252 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004253 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004254 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004255 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004256 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004257
Victor Stinner170ca6f2013-04-18 00:25:28 +02004258 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004259 if (replen > 1) {
4260 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004261 need_to_grow = 1;
4262 }
4263 new_inptr = *input + newpos;
4264 if (*inend - new_inptr > remain) {
4265 /* We don't know the decoding algorithm here so we make the worst
4266 assumption that one byte decodes to one unicode character.
4267 If unfortunately one byte could decode to more unicode characters,
4268 the decoder may write out-of-bound then. Is it possible for the
4269 algorithms using this function? */
4270 writer->min_length += *inend - new_inptr - remain;
4271 need_to_grow = 1;
4272 }
4273 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004274 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004275 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004276 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4277 goto onError;
4278 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004279 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004280 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004281
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004283 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004284
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004286 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004287 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004288
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004291 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004292}
4293
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004294/* --- UTF-7 Codec -------------------------------------------------------- */
4295
Antoine Pitrou244651a2009-05-04 18:56:13 +00004296/* See RFC2152 for details. We encode conservatively and decode liberally. */
4297
4298/* Three simple macros defining base-64. */
4299
4300/* Is c a base-64 character? */
4301
#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ('0' <= (c) && (c) <= '9') ||              \
     ('A' <= (c) && (c) <= 'Z') ||              \
     ('a' <= (c) && (c) <= 'z'))
4307
4308/* given that c is a base-64 character, what is its base-64 value? */
4309
#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     63)
4315
4316/* What is the base-64 character of the bottom 6 bits of n? */
4317
#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
4320
4321/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
4322 * decoded as itself. We are permissive on decoding; the only ASCII
4323 * byte not decoding to itself is the + which begins a base64
4324 * string. */
4325
#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) < 128)
4328
4329/* The UTF-7 encoder treats ASCII characters differently according to
4330 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4331 * the above). See RFC2152. This array identifies these different
4332 * sets:
4333 * 0 : "Set D"
4334 * alphanumeric and '(),-./:?
4335 * 1 : "Set O"
4336 * !"#$%&*;<=>@[]^_`{|}
4337 * 2 : "whitespace"
4338 * ht nl cr sp
4339 * 3 : special (must be base64 encoded)
4340 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4341 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004342
/* Read-only classification table indexed by ASCII code (0..127); the
   values 0-3 are the categories listed in the comment above.  Declared
   const because it is only ever read. */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4362
Antoine Pitrou244651a2009-05-04 18:56:13 +00004363/* ENCODE_DIRECT: this character should be encoded as itself. The
4364 * answer depends on whether we are encoding set O as itself, and also
4365 * on whether we are encoding whitespace as itself. RFC2152 makes it
4366 * clear that the answers to these questions vary between
4367 * applications, so this code needs to be flexible. */
Marc-André Lemburge115ec82005-10-19 22:33:31 +00004368
/* The directO / directWS arguments are parenthesized so that an argument
   expression containing a lower-precedence operator (e.g. `a || b`) is
   evaluated as a unit before being combined with `&&`. */
#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374
Alexander Belopolsky40018472011-02-26 01:02:56 +00004375PyObject *
4376PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004377 Py_ssize_t size,
4378 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004379{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004380 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4381}
4382
Antoine Pitrou244651a2009-05-04 18:56:13 +00004383/* The decoder. The only state we preserve is our read position,
4384 * i.e. how many characters we have consumed. So if we end in the
4385 * middle of a shift sequence we have to back off the read position
4386 * and the output to the beginning of the sequence, otherwise we lose
4387 * all the shift state (seen bits, number of bits seen, high
4388 * surrogate). */
4389
/* Decode the `size` bytes at `s` as UTF-7.

   errors   - error handler name, handed to
              unicode_decode_call_errorhandler_writer() on malformed input.
   consumed - if NULL, input that ends inside a shift sequence with
              left-over state is reported through the error handler as
              "unterminated shift sequence".  If non-NULL, it receives the
              number of input bytes consumed; when the input ends inside a
              shift sequence this is the offset of the opening '+' and the
              output written since that '+' is dropped.

   Returns the result of _PyUnicodeWriter_Finish() on success, or NULL via
   the onError path. */
PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;
    /* writer.pos at the moment the current shift sequence was opened */
    Py_ssize_t shiftOutStart;
    /* base64buffer accumulates 6 bits per base-64 character; base64bits is
       the number of not-yet-emitted bits currently held in it */
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    /* pending high surrogate waiting for its low half, or 0 */
    Py_UCS4 surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Re-entry point used after the end-of-input error handler moved
           `s` back inside the input (see the "goto restart" below). */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* not a low surrogate: emit the lone high
                               surrogate as-is, then handle outCh below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written out only when the
                   terminating byte is one that decodes as itself;
                   otherwise it is dropped by the reset below. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the shift sequence starts: used for error
               reporting and for *consumed when input ends mid-sequence */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* Shared error exit for the loop body: every jump here has already
           advanced `s` past the offending byte and set errmsg.  The handler
           may update starts, e and s. */
utf7Error:
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler chose a resume position before the end of the
               input: go back into the main loop */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            /* Something was written inside the unfinished shift sequence
               and the writer holds non-ASCII data: build the result
               directly from the first shiftOutStart characters instead of
               finishing the writer.  NOTE(review): presumably because the
               writer's maxchar may stem from the characters being
               discarded -- confirm. */
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4586
4587
Alexander Belopolsky40018472011-02-26 01:02:56 +00004588PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004589_PyUnicode_EncodeUTF7(PyObject *str,
4590 int base64SetO,
4591 int base64WhiteSpace,
4592 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004593{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004594 int kind;
4595 void *data;
4596 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004597 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004599 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 unsigned int base64bits = 0;
4601 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004602 char * out;
4603 char * start;
4604
Benjamin Petersonbac79492012-01-14 13:34:47 -05004605 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004606 return NULL;
4607 kind = PyUnicode_KIND(str);
4608 data = PyUnicode_DATA(str);
4609 len = PyUnicode_GET_LENGTH(str);
4610
4611 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004612 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004613
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004614 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004615 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004616 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004617 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004618 if (v == NULL)
4619 return NULL;
4620
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004621 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004622 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004623 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004624
Antoine Pitrou244651a2009-05-04 18:56:13 +00004625 if (inShift) {
4626 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4627 /* shifting out */
4628 if (base64bits) { /* output remaining bits */
4629 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4630 base64buffer = 0;
4631 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004632 }
4633 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 /* Characters not in the BASE64 set implicitly unshift the sequence
4635 so no '-' is required, except if the character is itself a '-' */
4636 if (IS_BASE64(ch) || ch == '-') {
4637 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004638 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004639 *out++ = (char) ch;
4640 }
4641 else {
4642 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004643 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004644 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004645 else { /* not in a shift sequence */
4646 if (ch == '+') {
4647 *out++ = '+';
4648 *out++ = '-';
4649 }
4650 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4651 *out++ = (char) ch;
4652 }
4653 else {
4654 *out++ = '+';
4655 inShift = 1;
4656 goto encode_char;
4657 }
4658 }
4659 continue;
4660encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004661 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004662 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004663
Antoine Pitrou244651a2009-05-04 18:56:13 +00004664 /* code first surrogate */
4665 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004666 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667 while (base64bits >= 6) {
4668 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4669 base64bits -= 6;
4670 }
4671 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004672 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004674 base64bits += 16;
4675 base64buffer = (base64buffer << 16) | ch;
4676 while (base64bits >= 6) {
4677 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4678 base64bits -= 6;
4679 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004680 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004681 if (base64bits)
4682 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4683 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004684 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004685 if (_PyBytes_Resize(&v, out - start) < 0)
4686 return NULL;
4687 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004688}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004689PyObject *
4690PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4691 Py_ssize_t size,
4692 int base64SetO,
4693 int base64WhiteSpace,
4694 const char *errors)
4695{
4696 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004697 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004698 if (tmp == NULL)
4699 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004700 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004701 base64WhiteSpace, errors);
4702 Py_DECREF(tmp);
4703 return result;
4704}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004705
Antoine Pitrou244651a2009-05-04 18:56:13 +00004706#undef IS_BASE64
4707#undef FROM_BASE64
4708#undef TO_BASE64
4709#undef DECODE_DIRECT
4710#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004711
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712/* --- UTF-8 Codec -------------------------------------------------------- */
4713
Alexander Belopolsky40018472011-02-26 01:02:56 +00004714PyObject *
4715PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004716 Py_ssize_t size,
4717 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718{
Walter Dörwald69652032004-09-07 20:24:22 +00004719 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4720}
4721
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004722#include "stringlib/asciilib.h"
4723#include "stringlib/codecs.h"
4724#include "stringlib/undef.h"
4725
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004726#include "stringlib/ucs1lib.h"
4727#include "stringlib/codecs.h"
4728#include "stringlib/undef.h"
4729
4730#include "stringlib/ucs2lib.h"
4731#include "stringlib/codecs.h"
4732#include "stringlib/undef.h"
4733
4734#include "stringlib/ucs4lib.h"
4735#include "stringlib/codecs.h"
4736#include "stringlib/undef.h"
4737
Antoine Pitrouab868312009-01-10 15:40:25 +00004738/* Mask to quickly check whether a C 'long' contains a
4739 non-ASCII, UTF8-encoded char. */
4740#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004741# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004742#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004743# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004744#else
4745# error C 'long' size should be either 4 or 8!
4746#endif
4747
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004748static Py_ssize_t
4749ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004750{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004751 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004752 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004753
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004754 /*
4755 * Issue #17237: m68k is a bit different from most architectures in
4756 * that objects do not use "natural alignment" - for example, int and
4757 * long are only aligned at 2-byte boundaries. Therefore the assert()
4758 * won't work; also, tests have shown that skipping the "optimised
4759 * version" will even speed up m68k.
4760 */
4761#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004762#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004763 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4764 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004765 /* Fast path, see in STRINGLIB(utf8_decode) for
4766 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004767 /* Help allocation */
4768 const char *_p = p;
4769 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 while (_p < aligned_end) {
4771 unsigned long value = *(const unsigned long *) _p;
4772 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004773 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004774 *((unsigned long *)q) = value;
4775 _p += SIZEOF_LONG;
4776 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004777 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004778 p = _p;
4779 while (p < end) {
4780 if ((unsigned char)*p & 0x80)
4781 break;
4782 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004784 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004786#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004787#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004788 while (p < end) {
4789 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4790 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004791 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004792 /* Help allocation */
4793 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794 while (_p < aligned_end) {
4795 unsigned long value = *(unsigned long *) _p;
4796 if (value & ASCII_CHAR_MASK)
4797 break;
4798 _p += SIZEOF_LONG;
4799 }
4800 p = _p;
4801 if (_p == end)
4802 break;
4803 }
4804 if ((unsigned char)*p & 0x80)
4805 break;
4806 ++p;
4807 }
4808 memcpy(dest, start, p - start);
4809 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810}
Antoine Pitrouab868312009-01-10 15:40:25 +00004811
Victor Stinner785938e2011-12-11 20:09:03 +01004812PyObject *
4813PyUnicode_DecodeUTF8Stateful(const char *s,
4814 Py_ssize_t size,
4815 const char *errors,
4816 Py_ssize_t *consumed)
4817{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004818 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004819 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004820 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004821
4822 Py_ssize_t startinpos;
4823 Py_ssize_t endinpos;
4824 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004825 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004826 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004827 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004828
4829 if (size == 0) {
4830 if (consumed)
4831 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004832 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004833 }
4834
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004835 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4836 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004837 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004838 *consumed = 1;
4839 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004840 }
4841
Victor Stinner8f674cc2013-04-17 23:02:17 +02004842 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004843 writer.min_length = size;
4844 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004845 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004846
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004847 writer.pos = ascii_decode(s, end, writer.data);
4848 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004849 while (s < end) {
4850 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004851 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004852
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004853 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004854 if (PyUnicode_IS_ASCII(writer.buffer))
4855 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004856 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004857 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004858 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004859 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004860 } else {
4861 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004862 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004863 }
4864
4865 switch (ch) {
4866 case 0:
4867 if (s == end || consumed)
4868 goto End;
4869 errmsg = "unexpected end of data";
4870 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004871 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004872 break;
4873 case 1:
4874 errmsg = "invalid start byte";
4875 startinpos = s - starts;
4876 endinpos = startinpos + 1;
4877 break;
4878 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004879 case 3:
4880 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004881 errmsg = "invalid continuation byte";
4882 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004883 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004884 break;
4885 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004886 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887 goto onError;
4888 continue;
4889 }
4890
Victor Stinner1d65d912015-10-05 13:43:50 +02004891 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004892 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004893
4894 switch (error_handler) {
4895 case _Py_ERROR_IGNORE:
4896 s += (endinpos - startinpos);
4897 break;
4898
4899 case _Py_ERROR_REPLACE:
4900 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4901 goto onError;
4902 s += (endinpos - startinpos);
4903 break;
4904
4905 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004906 {
4907 Py_ssize_t i;
4908
Victor Stinner1d65d912015-10-05 13:43:50 +02004909 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4910 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004911 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004912 ch = (Py_UCS4)(unsigned char)(starts[i]);
4913 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4914 ch + 0xdc00);
4915 writer.pos++;
4916 }
4917 s += (endinpos - startinpos);
4918 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004919 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004920
4921 default:
4922 if (unicode_decode_call_errorhandler_writer(
4923 errors, &error_handler_obj,
4924 "utf-8", errmsg,
4925 &starts, &end, &startinpos, &endinpos, &exc, &s,
4926 &writer))
4927 goto onError;
4928 }
Victor Stinner785938e2011-12-11 20:09:03 +01004929 }
4930
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004931End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004932 if (consumed)
4933 *consumed = s - starts;
4934
Victor Stinner1d65d912015-10-05 13:43:50 +02004935 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004936 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004937 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004938
4939onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004940 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004941 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004942 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004943 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004944}
4945
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004946
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004947/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4948 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004949
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004950 On success, write a pointer to a newly allocated wide character string into
4951 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4952 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004953
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004954 On memory allocation failure, return -1.
4955
4956 On decoding error (if surrogateescape is zero), return -2. If wlen is
4957 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4958 is not NULL, write the decoding error message into *reason. */
4959int
4960_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02004961 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004962{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004963 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004964 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004965 wchar_t *unicode;
4966 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004967
Victor Stinner3d4226a2018-08-29 22:21:32 +02004968 int surrogateescape = 0;
4969 int surrogatepass = 0;
4970 switch (errors)
4971 {
4972 case _Py_ERROR_STRICT:
4973 break;
4974 case _Py_ERROR_SURROGATEESCAPE:
4975 surrogateescape = 1;
4976 break;
4977 case _Py_ERROR_SURROGATEPASS:
4978 surrogatepass = 1;
4979 break;
4980 default:
4981 return -3;
4982 }
4983
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004984 /* Note: size will always be longer than the resulting Unicode
4985 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01004986 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004987 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004988 }
4989
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004990 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01004991 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004992 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004993 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004994
4995 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004996 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004997 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004998 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004999 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005000#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005002#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005004#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 if (ch > 0xFF) {
5006#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005007 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005008#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005009 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005010 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005011 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5012 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5013#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005014 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005015 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005016 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005017 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005018 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005019
5020 if (surrogateescape) {
5021 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5022 }
5023 else {
5024 /* Is it a valid three-byte code? */
5025 if (surrogatepass
5026 && (e - s) >= 3
5027 && (s[0] & 0xf0) == 0xe0
5028 && (s[1] & 0xc0) == 0x80
5029 && (s[2] & 0xc0) == 0x80)
5030 {
5031 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5032 s += 3;
5033 unicode[outpos++] = ch;
5034 }
5035 else {
5036 PyMem_RawFree(unicode );
5037 if (reason != NULL) {
5038 switch (ch) {
5039 case 0:
5040 *reason = "unexpected end of data";
5041 break;
5042 case 1:
5043 *reason = "invalid start byte";
5044 break;
5045 /* 2, 3, 4 */
5046 default:
5047 *reason = "invalid continuation byte";
5048 break;
5049 }
5050 }
5051 if (wlen != NULL) {
5052 *wlen = s - orig_s;
5053 }
5054 return -2;
5055 }
5056 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005057 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005058 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005059 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005060 if (wlen) {
5061 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005062 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005063 *wstr = unicode;
5064 return 0;
5065}
5066
5067wchar_t*
5068_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5069{
5070 wchar_t *wstr;
5071 int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5072 if (res != 0) {
5073 return NULL;
5074 }
5075 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005076}
5077
Antoine Pitrouab868312009-01-10 15:40:25 +00005078
Victor Stinnere47e6982017-12-21 15:45:16 +01005079/* UTF-8 encoder using the surrogateescape error handler .
5080
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005081 On success, return 0 and write the newly allocated character string (use
5082 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005083
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005084 On encoding failure, return -2 and write the position of the invalid
5085 surrogate character into *error_pos (if error_pos is set) and the decoding
5086 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005087
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005088 On memory allocation failure, return -1. */
5089int
5090_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005091 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005092{
5093 const Py_ssize_t max_char_size = 4;
5094 Py_ssize_t len = wcslen(text);
5095
5096 assert(len >= 0);
5097
Victor Stinner3d4226a2018-08-29 22:21:32 +02005098 int surrogateescape = 0;
5099 int surrogatepass = 0;
5100 switch (errors)
5101 {
5102 case _Py_ERROR_STRICT:
5103 break;
5104 case _Py_ERROR_SURROGATEESCAPE:
5105 surrogateescape = 1;
5106 break;
5107 case _Py_ERROR_SURROGATEPASS:
5108 surrogatepass = 1;
5109 break;
5110 default:
5111 return -3;
5112 }
5113
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005114 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5115 return -1;
5116 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005117 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005118 if (raw_malloc) {
5119 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005120 }
5121 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005122 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005123 }
5124 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005125 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005126 }
5127
5128 char *p = bytes;
5129 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005130 for (i = 0; i < len; ) {
5131 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005132 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005133 i++;
5134#if Py_UNICODE_SIZE == 2
5135 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5136 && i < len
5137 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5138 {
5139 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5140 i++;
5141 }
5142#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005143
5144 if (ch < 0x80) {
5145 /* Encode ASCII */
5146 *p++ = (char) ch;
5147
5148 }
5149 else if (ch < 0x0800) {
5150 /* Encode Latin-1 */
5151 *p++ = (char)(0xc0 | (ch >> 6));
5152 *p++ = (char)(0x80 | (ch & 0x3f));
5153 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005154 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005155 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005156 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005157 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005158 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005159 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005160 if (reason != NULL) {
5161 *reason = "encoding error";
5162 }
5163 if (raw_malloc) {
5164 PyMem_RawFree(bytes);
5165 }
5166 else {
5167 PyMem_Free(bytes);
5168 }
5169 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005170 }
5171 *p++ = (char)(ch & 0xff);
5172 }
5173 else if (ch < 0x10000) {
5174 *p++ = (char)(0xe0 | (ch >> 12));
5175 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5176 *p++ = (char)(0x80 | (ch & 0x3f));
5177 }
5178 else { /* ch >= 0x10000 */
5179 assert(ch <= MAX_UNICODE);
5180 /* Encode UCS4 Unicode ordinals */
5181 *p++ = (char)(0xf0 | (ch >> 18));
5182 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5183 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5184 *p++ = (char)(0x80 | (ch & 0x3f));
5185 }
5186 }
5187 *p++ = '\0';
5188
5189 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005190 char *bytes2;
5191 if (raw_malloc) {
5192 bytes2 = PyMem_RawRealloc(bytes, final_size);
5193 }
5194 else {
5195 bytes2 = PyMem_Realloc(bytes, final_size);
5196 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005197 if (bytes2 == NULL) {
5198 if (error_pos != NULL) {
5199 *error_pos = (size_t)-1;
5200 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005201 if (raw_malloc) {
5202 PyMem_RawFree(bytes);
5203 }
5204 else {
5205 PyMem_Free(bytes);
5206 }
5207 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005208 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005209 *str = bytes2;
5210 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005211}
5212
5213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005214/* Primary internal function which creates utf8 encoded bytes objects.
5215
5216 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005217 and allocate exactly as much space needed at the end. Else allocate the
5218 maximum possible needed (4 result bytes per Unicode character), and return
5219 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005220*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005221PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005222_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223{
Victor Stinner6099a032011-12-18 14:22:26 +01005224 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005225 void *data;
5226 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005227
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005228 if (!PyUnicode_Check(unicode)) {
5229 PyErr_BadArgument();
5230 return NULL;
5231 }
5232
5233 if (PyUnicode_READY(unicode) == -1)
5234 return NULL;
5235
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005236 if (PyUnicode_UTF8(unicode))
5237 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5238 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005239
5240 kind = PyUnicode_KIND(unicode);
5241 data = PyUnicode_DATA(unicode);
5242 size = PyUnicode_GET_LENGTH(unicode);
5243
Benjamin Petersonead6b532011-12-20 17:23:42 -06005244 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005245 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005246 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005247 case PyUnicode_1BYTE_KIND:
5248 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5249 assert(!PyUnicode_IS_ASCII(unicode));
5250 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5251 case PyUnicode_2BYTE_KIND:
5252 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5253 case PyUnicode_4BYTE_KIND:
5254 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256}
5257
Alexander Belopolsky40018472011-02-26 01:02:56 +00005258PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005259PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5260 Py_ssize_t size,
5261 const char *errors)
5262{
5263 PyObject *v, *unicode;
5264
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005265 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005266 if (unicode == NULL)
5267 return NULL;
5268 v = _PyUnicode_AsUTF8String(unicode, errors);
5269 Py_DECREF(unicode);
5270 return v;
5271}
5272
5273PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005274PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005276 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277}
5278
Walter Dörwald41980ca2007-08-16 21:55:45 +00005279/* --- UTF-32 Codec ------------------------------------------------------- */
5280
5281PyObject *
5282PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 Py_ssize_t size,
5284 const char *errors,
5285 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005286{
5287 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5288}
5289
5290PyObject *
5291PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005292 Py_ssize_t size,
5293 const char *errors,
5294 int *byteorder,
5295 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005296{
5297 const char *starts = s;
5298 Py_ssize_t startinpos;
5299 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005300 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005301 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005302 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005303 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005304 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005305 PyObject *errorHandler = NULL;
5306 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005307
Walter Dörwald41980ca2007-08-16 21:55:45 +00005308 q = (unsigned char *)s;
5309 e = q + size;
5310
5311 if (byteorder)
5312 bo = *byteorder;
5313
5314 /* Check for BOM marks (U+FEFF) in the input and adjust current
5315 byte order setting accordingly. In native mode, the leading BOM
5316 mark is skipped, in all other modes, it is copied to the output
5317 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005318 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005319 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005320 if (bom == 0x0000FEFF) {
5321 bo = -1;
5322 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005324 else if (bom == 0xFFFE0000) {
5325 bo = 1;
5326 q += 4;
5327 }
5328 if (byteorder)
5329 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005330 }
5331
Victor Stinnere64322e2012-10-30 23:12:47 +01005332 if (q == e) {
5333 if (consumed)
5334 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005335 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005336 }
5337
Victor Stinnere64322e2012-10-30 23:12:47 +01005338#ifdef WORDS_BIGENDIAN
5339 le = bo < 0;
5340#else
5341 le = bo <= 0;
5342#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005343 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005344
Victor Stinner8f674cc2013-04-17 23:02:17 +02005345 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005346 writer.min_length = (e - q + 3) / 4;
5347 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005348 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005349
Victor Stinnere64322e2012-10-30 23:12:47 +01005350 while (1) {
5351 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005352 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005353
Victor Stinnere64322e2012-10-30 23:12:47 +01005354 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005355 enum PyUnicode_Kind kind = writer.kind;
5356 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005357 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005358 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005359 if (le) {
5360 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005361 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005362 if (ch > maxch)
5363 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005364 if (kind != PyUnicode_1BYTE_KIND &&
5365 Py_UNICODE_IS_SURROGATE(ch))
5366 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005367 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005368 q += 4;
5369 } while (q <= last);
5370 }
5371 else {
5372 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005373 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005374 if (ch > maxch)
5375 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005376 if (kind != PyUnicode_1BYTE_KIND &&
5377 Py_UNICODE_IS_SURROGATE(ch))
5378 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005379 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005380 q += 4;
5381 } while (q <= last);
5382 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005383 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005384 }
5385
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005386 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005387 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005388 startinpos = ((const char *)q) - starts;
5389 endinpos = startinpos + 4;
5390 }
5391 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005392 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005393 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005394 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005395 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005396 startinpos = ((const char *)q) - starts;
5397 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005398 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005399 else {
5400 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005401 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005402 goto onError;
5403 q += 4;
5404 continue;
5405 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005406 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005407 startinpos = ((const char *)q) - starts;
5408 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005409 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005410
5411 /* The remaining input chars are ignored if the callback
5412 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005413 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005415 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005416 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005417 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005419 }
5420
Walter Dörwald41980ca2007-08-16 21:55:45 +00005421 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005422 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005423
Walter Dörwald41980ca2007-08-16 21:55:45 +00005424 Py_XDECREF(errorHandler);
5425 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005426 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005427
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005429 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005430 Py_XDECREF(errorHandler);
5431 Py_XDECREF(exc);
5432 return NULL;
5433}
5434
5435PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005436_PyUnicode_EncodeUTF32(PyObject *str,
5437 const char *errors,
5438 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005439{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005440 enum PyUnicode_Kind kind;
5441 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005442 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005443 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005444 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005445#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005446 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005447#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005448 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005449#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005450 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005451 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005452 PyObject *errorHandler = NULL;
5453 PyObject *exc = NULL;
5454 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005455
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005456 if (!PyUnicode_Check(str)) {
5457 PyErr_BadArgument();
5458 return NULL;
5459 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005460 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005461 return NULL;
5462 kind = PyUnicode_KIND(str);
5463 data = PyUnicode_DATA(str);
5464 len = PyUnicode_GET_LENGTH(str);
5465
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005466 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005467 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005468 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005469 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005470 if (v == NULL)
5471 return NULL;
5472
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005473 /* output buffer is 4-bytes aligned */
5474 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005475 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005476 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005477 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005478 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005479 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005480
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005481 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005482 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005483 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005484 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 else
5486 encoding = "utf-32";
5487
5488 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005489 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5490 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005491 }
5492
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005493 pos = 0;
5494 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005495 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005496
5497 if (kind == PyUnicode_2BYTE_KIND) {
5498 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5499 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005500 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005501 else {
5502 assert(kind == PyUnicode_4BYTE_KIND);
5503 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5504 &out, native_ordering);
5505 }
5506 if (pos == len)
5507 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005508
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005509 rep = unicode_encode_call_errorhandler(
5510 errors, &errorHandler,
5511 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005512 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005513 if (!rep)
5514 goto error;
5515
5516 if (PyBytes_Check(rep)) {
5517 repsize = PyBytes_GET_SIZE(rep);
5518 if (repsize & 3) {
5519 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005520 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005521 "surrogates not allowed");
5522 goto error;
5523 }
5524 moreunits = repsize / 4;
5525 }
5526 else {
5527 assert(PyUnicode_Check(rep));
5528 if (PyUnicode_READY(rep) < 0)
5529 goto error;
5530 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5531 if (!PyUnicode_IS_ASCII(rep)) {
5532 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005533 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005534 "surrogates not allowed");
5535 goto error;
5536 }
5537 }
5538
5539 /* four bytes are reserved for each surrogate */
5540 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005541 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005542 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005543 /* integer overflow */
5544 PyErr_NoMemory();
5545 goto error;
5546 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005547 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005548 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005549 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005550 }
5551
5552 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005553 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005554 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005555 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005556 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005557 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5558 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005559 }
5560
5561 Py_CLEAR(rep);
5562 }
5563
5564 /* Cut back to size actually needed. This is necessary for, for example,
5565 encoding of a string containing isolated surrogates and the 'ignore'
5566 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005567 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005568 if (nsize != PyBytes_GET_SIZE(v))
5569 _PyBytes_Resize(&v, nsize);
5570 Py_XDECREF(errorHandler);
5571 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005572 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005573 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005574 error:
5575 Py_XDECREF(rep);
5576 Py_XDECREF(errorHandler);
5577 Py_XDECREF(exc);
5578 Py_XDECREF(v);
5579 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005580}
5581
Alexander Belopolsky40018472011-02-26 01:02:56 +00005582PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005583PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5584 Py_ssize_t size,
5585 const char *errors,
5586 int byteorder)
5587{
5588 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005589 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005590 if (tmp == NULL)
5591 return NULL;
5592 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5593 Py_DECREF(tmp);
5594 return result;
5595}
5596
5597PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005598PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005599{
Victor Stinnerb960b342011-11-20 19:12:52 +01005600 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005601}
5602
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603/* --- UTF-16 Codec ------------------------------------------------------- */
5604
Tim Peters772747b2001-08-09 22:21:55 +00005605PyObject *
5606PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005607 Py_ssize_t size,
5608 const char *errors,
5609 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610{
Walter Dörwald69652032004-09-07 20:24:22 +00005611 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5612}
5613
5614PyObject *
5615PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005616 Py_ssize_t size,
5617 const char *errors,
5618 int *byteorder,
5619 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005620{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005621 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005622 Py_ssize_t startinpos;
5623 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005624 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005625 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005626 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005627 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005628 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005629 PyObject *errorHandler = NULL;
5630 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005631 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632
Tim Peters772747b2001-08-09 22:21:55 +00005633 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005634 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635
5636 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005637 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005639 /* Check for BOM marks (U+FEFF) in the input and adjust current
5640 byte order setting accordingly. In native mode, the leading BOM
5641 mark is skipped, in all other modes, it is copied to the output
5642 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005643 if (bo == 0 && size >= 2) {
5644 const Py_UCS4 bom = (q[1] << 8) | q[0];
5645 if (bom == 0xFEFF) {
5646 q += 2;
5647 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005648 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005649 else if (bom == 0xFFFE) {
5650 q += 2;
5651 bo = 1;
5652 }
5653 if (byteorder)
5654 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005655 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656
Antoine Pitrou63065d72012-05-15 23:48:04 +02005657 if (q == e) {
5658 if (consumed)
5659 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005660 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005661 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005662
Christian Heimes743e0cd2012-10-17 23:52:17 +02005663#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005664 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005665 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005666#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005667 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005668 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005669#endif
Tim Peters772747b2001-08-09 22:21:55 +00005670
Antoine Pitrou63065d72012-05-15 23:48:04 +02005671 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005672 character count normally. Error handler will take care of
5673 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005674 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005675 writer.min_length = (e - q + 1) / 2;
5676 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005677 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005678
Antoine Pitrou63065d72012-05-15 23:48:04 +02005679 while (1) {
5680 Py_UCS4 ch = 0;
5681 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005682 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005683 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005684 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005685 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005686 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005687 native_ordering);
5688 else
5689 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005690 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005691 native_ordering);
5692 } else if (kind == PyUnicode_2BYTE_KIND) {
5693 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005694 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005695 native_ordering);
5696 } else {
5697 assert(kind == PyUnicode_4BYTE_KIND);
5698 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005699 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005700 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005701 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005702 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005703
Antoine Pitrou63065d72012-05-15 23:48:04 +02005704 switch (ch)
5705 {
5706 case 0:
5707 /* remaining byte at the end? (size should be even) */
5708 if (q == e || consumed)
5709 goto End;
5710 errmsg = "truncated data";
5711 startinpos = ((const char *)q) - starts;
5712 endinpos = ((const char *)e) - starts;
5713 break;
5714 /* The remaining input chars are ignored if the callback
5715 chooses to skip the input */
5716 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005717 q -= 2;
5718 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005719 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005720 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005721 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005722 endinpos = ((const char *)e) - starts;
5723 break;
5724 case 2:
5725 errmsg = "illegal encoding";
5726 startinpos = ((const char *)q) - 2 - starts;
5727 endinpos = startinpos + 2;
5728 break;
5729 case 3:
5730 errmsg = "illegal UTF-16 surrogate";
5731 startinpos = ((const char *)q) - 4 - starts;
5732 endinpos = startinpos + 2;
5733 break;
5734 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005735 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005736 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 continue;
5738 }
5739
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005740 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005741 errors,
5742 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005743 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005744 &starts,
5745 (const char **)&e,
5746 &startinpos,
5747 &endinpos,
5748 &exc,
5749 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005750 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752 }
5753
Antoine Pitrou63065d72012-05-15 23:48:04 +02005754End:
Walter Dörwald69652032004-09-07 20:24:22 +00005755 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005756 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005757
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005758 Py_XDECREF(errorHandler);
5759 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005760 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005763 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005764 Py_XDECREF(errorHandler);
5765 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 return NULL;
5767}
5768
Tim Peters772747b2001-08-09 22:21:55 +00005769PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005770_PyUnicode_EncodeUTF16(PyObject *str,
5771 const char *errors,
5772 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005774 enum PyUnicode_Kind kind;
5775 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005776 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005777 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005778 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005779 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005780#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005781 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005782#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005783 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005784#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005785 const char *encoding;
5786 Py_ssize_t nsize, pos;
5787 PyObject *errorHandler = NULL;
5788 PyObject *exc = NULL;
5789 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005790
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005791 if (!PyUnicode_Check(str)) {
5792 PyErr_BadArgument();
5793 return NULL;
5794 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005795 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005796 return NULL;
5797 kind = PyUnicode_KIND(str);
5798 data = PyUnicode_DATA(str);
5799 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005800
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005801 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005802 if (kind == PyUnicode_4BYTE_KIND) {
5803 const Py_UCS4 *in = (const Py_UCS4 *)data;
5804 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005805 while (in < end) {
5806 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005807 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005808 }
5809 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005810 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005811 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005813 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005814 nsize = len + pairs + (byteorder == 0);
5815 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005816 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005818 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005820 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005821 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005822 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005823 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005824 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005825 }
5826 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005827 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005828 }
Tim Peters772747b2001-08-09 22:21:55 +00005829
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005830 if (kind == PyUnicode_1BYTE_KIND) {
5831 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5832 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005833 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005834
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005835 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005836 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005837 }
5838 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005839 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005840 }
5841 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005842 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005843 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005844
5845 pos = 0;
5846 while (pos < len) {
5847 Py_ssize_t repsize, moreunits;
5848
5849 if (kind == PyUnicode_2BYTE_KIND) {
5850 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5851 &out, native_ordering);
5852 }
5853 else {
5854 assert(kind == PyUnicode_4BYTE_KIND);
5855 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5856 &out, native_ordering);
5857 }
5858 if (pos == len)
5859 break;
5860
5861 rep = unicode_encode_call_errorhandler(
5862 errors, &errorHandler,
5863 encoding, "surrogates not allowed",
5864 str, &exc, pos, pos + 1, &pos);
5865 if (!rep)
5866 goto error;
5867
5868 if (PyBytes_Check(rep)) {
5869 repsize = PyBytes_GET_SIZE(rep);
5870 if (repsize & 1) {
5871 raise_encode_exception(&exc, encoding,
5872 str, pos - 1, pos,
5873 "surrogates not allowed");
5874 goto error;
5875 }
5876 moreunits = repsize / 2;
5877 }
5878 else {
5879 assert(PyUnicode_Check(rep));
5880 if (PyUnicode_READY(rep) < 0)
5881 goto error;
5882 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5883 if (!PyUnicode_IS_ASCII(rep)) {
5884 raise_encode_exception(&exc, encoding,
5885 str, pos - 1, pos,
5886 "surrogates not allowed");
5887 goto error;
5888 }
5889 }
5890
5891 /* two bytes are reserved for each surrogate */
5892 if (moreunits > 1) {
5893 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005894 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005895 /* integer overflow */
5896 PyErr_NoMemory();
5897 goto error;
5898 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005899 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005900 goto error;
5901 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5902 }
5903
5904 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005905 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005906 out += moreunits;
5907 } else /* rep is unicode */ {
5908 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5909 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5910 &out, native_ordering);
5911 }
5912
5913 Py_CLEAR(rep);
5914 }
5915
5916 /* Cut back to size actually needed. This is necessary for, for example,
5917 encoding of a string containing isolated surrogates and the 'ignore' handler
5918 is used. */
5919 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5920 if (nsize != PyBytes_GET_SIZE(v))
5921 _PyBytes_Resize(&v, nsize);
5922 Py_XDECREF(errorHandler);
5923 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005924 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005925 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005926 error:
5927 Py_XDECREF(rep);
5928 Py_XDECREF(errorHandler);
5929 Py_XDECREF(exc);
5930 Py_XDECREF(v);
5931 return NULL;
5932#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933}
5934
Alexander Belopolsky40018472011-02-26 01:02:56 +00005935PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005936PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5937 Py_ssize_t size,
5938 const char *errors,
5939 int byteorder)
5940{
5941 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005942 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005943 if (tmp == NULL)
5944 return NULL;
5945 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5946 Py_DECREF(tmp);
5947 return result;
5948}
5949
5950PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005951PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005953 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954}
5955
5956/* --- Unicode Escape Codec ----------------------------------------------- */
5957
/* Capsule API used to look up character names for \N{name} escapes.  Stays
   NULL until _PyUnicode_DecodeUnicodeEscape() first meets a \N escape and
   imports it through PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1). */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005958static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005959
Alexander Belopolsky40018472011-02-26 01:02:56 +00005960PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005961_PyUnicode_DecodeUnicodeEscape(const char *s,
5962 Py_ssize_t size,
5963 const char *errors,
5964 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005966 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005967 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005969 PyObject *errorHandler = NULL;
5970 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005971
Eric V. Smith42454af2016-10-31 09:22:08 -04005972 // so we can remember if we've seen an invalid escape char or not
5973 *first_invalid_escape = NULL;
5974
Victor Stinner62ec3312016-09-06 17:04:34 -07005975 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005976 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005977 }
5978 /* Escaped strings will always be longer than the resulting
5979 Unicode string, so we start with size here and then reduce the
5980 length after conversion to the true value.
5981 (but if the error callback returns a long replacement string
5982 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005983 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005984 writer.min_length = size;
5985 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5986 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005987 }
5988
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 end = s + size;
5990 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005991 unsigned char c = (unsigned char) *s++;
5992 Py_UCS4 ch;
5993 int count;
5994 Py_ssize_t startinpos;
5995 Py_ssize_t endinpos;
5996 const char *message;
5997
5998#define WRITE_ASCII_CHAR(ch) \
5999 do { \
6000 assert(ch <= 127); \
6001 assert(writer.pos < writer.size); \
6002 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6003 } while(0)
6004
6005#define WRITE_CHAR(ch) \
6006 do { \
6007 if (ch <= writer.maxchar) { \
6008 assert(writer.pos < writer.size); \
6009 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6010 } \
6011 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6012 goto onError; \
6013 } \
6014 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015
6016 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006017 if (c != '\\') {
6018 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 continue;
6020 }
6021
Victor Stinner62ec3312016-09-06 17:04:34 -07006022 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006024 if (s >= end) {
6025 message = "\\ at end of string";
6026 goto error;
6027 }
6028 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006029
Victor Stinner62ec3312016-09-06 17:04:34 -07006030 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006031 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006034 case '\n': continue;
6035 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6036 case '\'': WRITE_ASCII_CHAR('\''); continue;
6037 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6038 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006039 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006040 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6041 case 't': WRITE_ASCII_CHAR('\t'); continue;
6042 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6043 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006044 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006045 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006046 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006047 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 case '0': case '1': case '2': case '3':
6051 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006052 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006053 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006054 ch = (ch<<3) + *s++ - '0';
6055 if (s < end && '0' <= *s && *s <= '7') {
6056 ch = (ch<<3) + *s++ - '0';
6057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006059 WRITE_CHAR(ch);
6060 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061
Benjamin Peterson29060642009-01-31 22:14:21 +00006062 /* hex escapes */
6063 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006065 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006066 message = "truncated \\xXX escape";
6067 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006071 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006072 message = "truncated \\uXXXX escape";
6073 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006076 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006077 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006078 message = "truncated \\UXXXXXXXX escape";
6079 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006080 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006081 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006082 ch <<= 4;
6083 if (c >= '0' && c <= '9') {
6084 ch += c - '0';
6085 }
6086 else if (c >= 'a' && c <= 'f') {
6087 ch += c - ('a' - 10);
6088 }
6089 else if (c >= 'A' && c <= 'F') {
6090 ch += c - ('A' - 10);
6091 }
6092 else {
6093 break;
6094 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006095 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006096 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006097 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006098 }
6099
6100 /* when we get here, ch is a 32-bit unicode character */
6101 if (ch > MAX_UNICODE) {
6102 message = "illegal Unicode character";
6103 goto error;
6104 }
6105
6106 WRITE_CHAR(ch);
6107 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006108
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006110 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006111 if (ucnhash_CAPI == NULL) {
6112 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006113 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6114 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006115 if (ucnhash_CAPI == NULL) {
6116 PyErr_SetString(
6117 PyExc_UnicodeError,
6118 "\\N escapes not supported (can't load unicodedata module)"
6119 );
6120 goto onError;
6121 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006122 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006123
6124 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006125 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006126 const char *start = ++s;
6127 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006128 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006129 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006130 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006131 namelen = s - start;
6132 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006133 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006134 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006135 ch = 0xffffffff; /* in case 'getcode' messes up */
6136 if (namelen <= INT_MAX &&
6137 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6138 &ch, 0)) {
6139 assert(ch <= MAX_UNICODE);
6140 WRITE_CHAR(ch);
6141 continue;
6142 }
6143 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006144 }
6145 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006146 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006147
6148 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006149 if (*first_invalid_escape == NULL) {
6150 *first_invalid_escape = s-1; /* Back up one char, since we've
6151 already incremented s. */
6152 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006153 WRITE_ASCII_CHAR('\\');
6154 WRITE_CHAR(c);
6155 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006157
6158 error:
6159 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006160 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006161 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006162 errors, &errorHandler,
6163 "unicodeescape", message,
6164 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006165 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006166 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006167 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006168 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006169
6170#undef WRITE_ASCII_CHAR
6171#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006173
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006174 Py_XDECREF(errorHandler);
6175 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006176 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006177
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006179 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006180 Py_XDECREF(errorHandler);
6181 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 return NULL;
6183}
6184
Eric V. Smith42454af2016-10-31 09:22:08 -04006185PyObject *
6186PyUnicode_DecodeUnicodeEscape(const char *s,
6187 Py_ssize_t size,
6188 const char *errors)
6189{
6190 const char *first_invalid_escape;
6191 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6192 &first_invalid_escape);
6193 if (result == NULL)
6194 return NULL;
6195 if (first_invalid_escape != NULL) {
6196 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6197 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006198 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006199 Py_DECREF(result);
6200 return NULL;
6201 }
6202 }
6203 return result;
6204}
6205
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006206/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207
Alexander Belopolsky40018472011-02-26 01:02:56 +00006208PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006209PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006211 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006212 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006214 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006215 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006216 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217
Ezio Melottie7f90372012-10-05 03:33:31 +03006218 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006219 escape.
6220
Ezio Melottie7f90372012-10-05 03:33:31 +03006221 For UCS1 strings it's '\xxx', 4 bytes per source character.
6222 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6223 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006224 */
6225
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006226 if (!PyUnicode_Check(unicode)) {
6227 PyErr_BadArgument();
6228 return NULL;
6229 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006230 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006231 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006232 }
Victor Stinner358af132015-10-12 22:36:57 +02006233
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006234 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006235 if (len == 0) {
6236 return PyBytes_FromStringAndSize(NULL, 0);
6237 }
6238
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006239 kind = PyUnicode_KIND(unicode);
6240 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006241 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6242 bytes, and 1 byte characters 4. */
6243 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006244 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006245 return PyErr_NoMemory();
6246 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006247 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006248 if (repr == NULL) {
6249 return NULL;
6250 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006251
Victor Stinner62ec3312016-09-06 17:04:34 -07006252 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006253 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006254 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006255
Victor Stinner62ec3312016-09-06 17:04:34 -07006256 /* U+0000-U+00ff range */
6257 if (ch < 0x100) {
6258 if (ch >= ' ' && ch < 127) {
6259 if (ch != '\\') {
6260 /* Copy printable US ASCII as-is */
6261 *p++ = (char) ch;
6262 }
6263 /* Escape backslashes */
6264 else {
6265 *p++ = '\\';
6266 *p++ = '\\';
6267 }
6268 }
Victor Stinner358af132015-10-12 22:36:57 +02006269
Victor Stinner62ec3312016-09-06 17:04:34 -07006270 /* Map special whitespace to '\t', \n', '\r' */
6271 else if (ch == '\t') {
6272 *p++ = '\\';
6273 *p++ = 't';
6274 }
6275 else if (ch == '\n') {
6276 *p++ = '\\';
6277 *p++ = 'n';
6278 }
6279 else if (ch == '\r') {
6280 *p++ = '\\';
6281 *p++ = 'r';
6282 }
6283
6284 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6285 else {
6286 *p++ = '\\';
6287 *p++ = 'x';
6288 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6289 *p++ = Py_hexdigits[ch & 0x000F];
6290 }
Tim Petersced69f82003-09-16 20:30:58 +00006291 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006292 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006293 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 *p++ = '\\';
6295 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006296 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6297 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6298 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6299 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6302 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006303
Victor Stinner62ec3312016-09-06 17:04:34 -07006304 /* Make sure that the first two digits are zero */
6305 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006306 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006307 *p++ = 'U';
6308 *p++ = '0';
6309 *p++ = '0';
6310 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6311 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6312 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6313 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6314 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6315 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318
Victor Stinner62ec3312016-09-06 17:04:34 -07006319 assert(p - PyBytes_AS_STRING(repr) > 0);
6320 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6321 return NULL;
6322 }
6323 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324}
6325
Alexander Belopolsky40018472011-02-26 01:02:56 +00006326PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006327PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6328 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006330 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006331 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006332 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006334 }
6335
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006336 result = PyUnicode_AsUnicodeEscapeString(tmp);
6337 Py_DECREF(tmp);
6338 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339}
6340
6341/* --- Raw Unicode Escape Codec ------------------------------------------- */
6342
Alexander Belopolsky40018472011-02-26 01:02:56 +00006343PyObject *
6344PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006345 Py_ssize_t size,
6346 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006348 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006349 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006351 PyObject *errorHandler = NULL;
6352 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006353
Victor Stinner62ec3312016-09-06 17:04:34 -07006354 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006355 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006356 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006357
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358 /* Escaped strings will always be longer than the resulting
6359 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360 length after conversion to the true value. (But decoding error
6361 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006362 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006363 writer.min_length = size;
6364 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6365 goto onError;
6366 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006367
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 end = s + size;
6369 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006370 unsigned char c = (unsigned char) *s++;
6371 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006372 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006373 Py_ssize_t startinpos;
6374 Py_ssize_t endinpos;
6375 const char *message;
6376
6377#define WRITE_CHAR(ch) \
6378 do { \
6379 if (ch <= writer.maxchar) { \
6380 assert(writer.pos < writer.size); \
6381 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6382 } \
6383 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6384 goto onError; \
6385 } \
6386 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006389 if (c != '\\' || s >= end) {
6390 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006391 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006392 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006393
Victor Stinner62ec3312016-09-06 17:04:34 -07006394 c = (unsigned char) *s++;
6395 if (c == 'u') {
6396 count = 4;
6397 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006399 else if (c == 'U') {
6400 count = 8;
6401 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006402 }
6403 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006404 assert(writer.pos < writer.size);
6405 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6406 WRITE_CHAR(c);
6407 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006408 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006409 startinpos = s - starts - 2;
6410
6411 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6412 for (ch = 0; count && s < end; ++s, --count) {
6413 c = (unsigned char)*s;
6414 ch <<= 4;
6415 if (c >= '0' && c <= '9') {
6416 ch += c - '0';
6417 }
6418 else if (c >= 'a' && c <= 'f') {
6419 ch += c - ('a' - 10);
6420 }
6421 else if (c >= 'A' && c <= 'F') {
6422 ch += c - ('A' - 10);
6423 }
6424 else {
6425 break;
6426 }
6427 }
6428 if (!count) {
6429 if (ch <= MAX_UNICODE) {
6430 WRITE_CHAR(ch);
6431 continue;
6432 }
6433 message = "\\Uxxxxxxxx out of range";
6434 }
6435
6436 endinpos = s-starts;
6437 writer.min_length = end - s + writer.pos;
6438 if (unicode_decode_call_errorhandler_writer(
6439 errors, &errorHandler,
6440 "rawunicodeescape", message,
6441 &starts, &end, &startinpos, &endinpos, &exc, &s,
6442 &writer)) {
6443 goto onError;
6444 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006445 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006446
6447#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006449 Py_XDECREF(errorHandler);
6450 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006451 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006452
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006454 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455 Py_XDECREF(errorHandler);
6456 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006458
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459}
6460
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006461
Alexander Belopolsky40018472011-02-26 01:02:56 +00006462PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006463PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464{
Victor Stinner62ec3312016-09-06 17:04:34 -07006465 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006467 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006468 int kind;
6469 void *data;
6470 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006472 if (!PyUnicode_Check(unicode)) {
6473 PyErr_BadArgument();
6474 return NULL;
6475 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006476 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006477 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006478 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006479 kind = PyUnicode_KIND(unicode);
6480 data = PyUnicode_DATA(unicode);
6481 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006482 if (kind == PyUnicode_1BYTE_KIND) {
6483 return PyBytes_FromStringAndSize(data, len);
6484 }
Victor Stinner0e368262011-11-10 20:12:49 +01006485
Victor Stinner62ec3312016-09-06 17:04:34 -07006486 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6487 bytes, and 1 byte characters 4. */
6488 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006489
Victor Stinner62ec3312016-09-06 17:04:34 -07006490 if (len > PY_SSIZE_T_MAX / expandsize) {
6491 return PyErr_NoMemory();
6492 }
6493 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6494 if (repr == NULL) {
6495 return NULL;
6496 }
6497 if (len == 0) {
6498 return repr;
6499 }
6500
6501 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006502 for (pos = 0; pos < len; pos++) {
6503 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006504
Victor Stinner62ec3312016-09-06 17:04:34 -07006505 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6506 if (ch < 0x100) {
6507 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006508 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006509 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006510 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 *p++ = '\\';
6512 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006513 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6514 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6515 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6516 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006518 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6519 else {
6520 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6521 *p++ = '\\';
6522 *p++ = 'U';
6523 *p++ = '0';
6524 *p++ = '0';
6525 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6526 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6527 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6528 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6529 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6530 *p++ = Py_hexdigits[ch & 15];
6531 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006533
Victor Stinner62ec3312016-09-06 17:04:34 -07006534 assert(p > PyBytes_AS_STRING(repr));
6535 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6536 return NULL;
6537 }
6538 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539}
6540
Alexander Belopolsky40018472011-02-26 01:02:56 +00006541PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006542PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6543 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006545 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006546 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006547 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006548 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006549 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6550 Py_DECREF(tmp);
6551 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552}
6553
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006554/* --- Unicode Internal Codec ------------------------------------------- */
6555
Alexander Belopolsky40018472011-02-26 01:02:56 +00006556PyObject *
6557_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006558 Py_ssize_t size,
6559 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006560{
6561 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006562 Py_ssize_t startinpos;
6563 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006564 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006565 const char *end;
6566 const char *reason;
6567 PyObject *errorHandler = NULL;
6568 PyObject *exc = NULL;
6569
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006570 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006571 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006572 1))
6573 return NULL;
6574
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006575 if (size < 0) {
6576 PyErr_BadInternalCall();
6577 return NULL;
6578 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006579 if (size == 0)
6580 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006581
Victor Stinner8f674cc2013-04-17 23:02:17 +02006582 _PyUnicodeWriter_Init(&writer);
6583 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6584 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006585 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006586 }
6587 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006588
Victor Stinner8f674cc2013-04-17 23:02:17 +02006589 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006590 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006591 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006592 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006593 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006594 endinpos = end-starts;
6595 reason = "truncated input";
6596 goto error;
6597 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006598 /* We copy the raw representation one byte at a time because the
6599 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006600 ((char *) &uch)[0] = s[0];
6601 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006602#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006603 ((char *) &uch)[2] = s[2];
6604 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006605#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006606 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006607#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006608 /* We have to sanity check the raw data, otherwise doom looms for
6609 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006610 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006611 endinpos = s - starts + Py_UNICODE_SIZE;
6612 reason = "illegal code point (> 0x10FFFF)";
6613 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006614 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006615#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006616 s += Py_UNICODE_SIZE;
6617#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006618 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006619 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006620 Py_UNICODE uch2;
6621 ((char *) &uch2)[0] = s[0];
6622 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006623 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006624 {
Victor Stinner551ac952011-11-29 22:58:13 +01006625 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006626 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006627 }
6628 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006629#endif
6630
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006631 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006632 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006633 continue;
6634
6635 error:
6636 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006637 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006638 errors, &errorHandler,
6639 "unicode_internal", reason,
6640 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006641 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006642 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006643 }
6644
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006645 Py_XDECREF(errorHandler);
6646 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006647 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006648
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006650 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006651 Py_XDECREF(errorHandler);
6652 Py_XDECREF(exc);
6653 return NULL;
6654}
6655
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656/* --- Latin-1 Codec ------------------------------------------------------ */
6657
Alexander Belopolsky40018472011-02-26 01:02:56 +00006658PyObject *
6659PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006660 Py_ssize_t size,
6661 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006664 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665}
6666
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006667/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006668static void
6669make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006670 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006671 PyObject *unicode,
6672 Py_ssize_t startpos, Py_ssize_t endpos,
6673 const char *reason)
6674{
6675 if (*exceptionObject == NULL) {
6676 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006677 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006678 encoding, unicode, startpos, endpos, reason);
6679 }
6680 else {
6681 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6682 goto onError;
6683 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6684 goto onError;
6685 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6686 goto onError;
6687 return;
6688 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006689 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006690 }
6691}
6692
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006693/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006694static void
6695raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006696 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006697 PyObject *unicode,
6698 Py_ssize_t startpos, Py_ssize_t endpos,
6699 const char *reason)
6700{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006701 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006702 encoding, unicode, startpos, endpos, reason);
6703 if (*exceptionObject != NULL)
6704 PyCodec_StrictErrors(*exceptionObject);
6705}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006706
6707/* error handling callback helper:
6708 build arguments, call the callback and check the arguments,
6709 put the result into newpos and return the replacement string, which
6710 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006711static PyObject *
6712unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006713 PyObject **errorHandler,
6714 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006715 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006716 Py_ssize_t startpos, Py_ssize_t endpos,
6717 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006718{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006719 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006720 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721 PyObject *restuple;
6722 PyObject *resunicode;
6723
6724 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006726 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006728 }
6729
Benjamin Petersonbac79492012-01-14 13:34:47 -05006730 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006731 return NULL;
6732 len = PyUnicode_GET_LENGTH(unicode);
6733
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006734 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006735 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006736 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006738
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006739 restuple = PyObject_CallFunctionObjArgs(
6740 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006741 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006743 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006744 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 Py_DECREF(restuple);
6746 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006747 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006748 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006749 &resunicode, newpos)) {
6750 Py_DECREF(restuple);
6751 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006752 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006753 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6754 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6755 Py_DECREF(restuple);
6756 return NULL;
6757 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006758 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759 *newpos = len + *newpos;
6760 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006761 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 Py_DECREF(restuple);
6763 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006764 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006765 Py_INCREF(resunicode);
6766 Py_DECREF(restuple);
6767 return resunicode;
6768}
6769
Alexander Belopolsky40018472011-02-26 01:02:56 +00006770static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006771unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006772 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006773 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006774{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006775 /* input state */
6776 Py_ssize_t pos=0, size;
6777 int kind;
6778 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006779 /* pointer into the output */
6780 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006781 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6782 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006783 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006784 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006785 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006786 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006787 /* output object */
6788 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006789
Benjamin Petersonbac79492012-01-14 13:34:47 -05006790 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006791 return NULL;
6792 size = PyUnicode_GET_LENGTH(unicode);
6793 kind = PyUnicode_KIND(unicode);
6794 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006795 /* allocate enough for a simple encoding without
6796 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006797 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006798 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006799
6800 _PyBytesWriter_Init(&writer);
6801 str = _PyBytesWriter_Alloc(&writer, size);
6802 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006803 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006804
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006805 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006806 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006807
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006809 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006811 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006812 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006813 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006815 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006816 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006817 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006818 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006819 /* find all unencodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006820
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006821 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006823
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006824 /* Only overallocate the buffer if it's not the last write */
6825 writer.overallocate = (collend < size);
6826
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006828 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006829 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006830
6831 switch (error_handler) {
6832 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006833 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006835
6836 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006837 memset(str, '?', collend - collstart);
6838 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006839 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006840 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006841 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006842 break;
Victor Stinner50149202015-09-22 00:26:54 +02006843
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006844 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006845 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006846 writer.min_size -= (collend - collstart);
6847 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006848 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006849 if (str == NULL)
6850 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006851 pos = collend;
6852 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006853
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006854 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006855 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006856 writer.min_size -= (collend - collstart);
6857 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006858 unicode, collstart, collend);
6859 if (str == NULL)
6860 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006861 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 break;
Victor Stinner50149202015-09-22 00:26:54 +02006863
Victor Stinnerc3713e92015-09-29 12:32:13 +02006864 case _Py_ERROR_SURROGATEESCAPE:
6865 for (i = collstart; i < collend; ++i) {
6866 ch = PyUnicode_READ(kind, data, i);
6867 if (ch < 0xdc80 || 0xdcff < ch) {
6868 /* Not a UTF-8b surrogate */
6869 break;
6870 }
6871 *str++ = (char)(ch - 0xdc00);
6872 ++pos;
6873 }
6874 if (i >= collend)
6875 break;
6876 collstart = pos;
6877 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006878 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006879
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006881 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6882 encoding, reason, unicode, &exc,
6883 collstart, collend, &newpos);
6884 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006886
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006887 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006888 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006889
Victor Stinner6bd525b2015-10-09 13:10:05 +02006890 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006891 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006892 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006893 PyBytes_AS_STRING(rep),
6894 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006895 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006896 else {
6897 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006898
Victor Stinner6bd525b2015-10-09 13:10:05 +02006899 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006900 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006901
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006902 if (limit == 256 ?
6903 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6904 !PyUnicode_IS_ASCII(rep))
6905 {
6906 /* Not all characters are smaller than limit */
6907 raise_encode_exception(&exc, encoding, unicode,
6908 collstart, collend, reason);
6909 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006910 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006911 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6912 str = _PyBytesWriter_WriteBytes(&writer, str,
6913 PyUnicode_DATA(rep),
6914 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006915 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006916 if (str == NULL)
6917 goto onError;
6918
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006919 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006920 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006921 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006922
6923 /* If overallocation was disabled, ensure that it was the last
6924 write. Otherwise, we missed an optimization */
6925 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006926 }
6927 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006928
Victor Stinner50149202015-09-22 00:26:54 +02006929 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006930 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006931 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006932
6933 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006934 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006935 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006936 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006937 Py_XDECREF(exc);
6938 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006939}
6940
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006941/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006942PyObject *
6943PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006944 Py_ssize_t size,
6945 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006947 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006948 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006949 if (unicode == NULL)
6950 return NULL;
6951 result = unicode_encode_ucs1(unicode, errors, 256);
6952 Py_DECREF(unicode);
6953 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954}
6955
Alexander Belopolsky40018472011-02-26 01:02:56 +00006956PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006957_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958{
6959 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006960 PyErr_BadArgument();
6961 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006963 if (PyUnicode_READY(unicode) == -1)
6964 return NULL;
6965 /* Fast path: if it is a one-byte string, construct
6966 bytes object directly. */
6967 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6968 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6969 PyUnicode_GET_LENGTH(unicode));
6970 /* Non-Latin-1 characters present. Defer to above function to
6971 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006972 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006973}
6974
6975PyObject*
6976PyUnicode_AsLatin1String(PyObject *unicode)
6977{
6978 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979}
6980
6981/* --- 7-bit ASCII Codec -------------------------------------------------- */
6982
Alexander Belopolsky40018472011-02-26 01:02:56 +00006983PyObject *
6984PyUnicode_DecodeASCII(const char *s,
6985 Py_ssize_t size,
6986 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006988 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006989 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006990 int kind;
6991 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006992 Py_ssize_t startinpos;
6993 Py_ssize_t endinpos;
6994 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006995 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006996 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006997 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006998 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006999
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007001 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007002
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007004 if (size == 1 && (unsigned char)s[0] < 128)
7005 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007006
Victor Stinner8f674cc2013-04-17 23:02:17 +02007007 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007008 writer.min_length = size;
7009 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02007010 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007011
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007012 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007013 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007014 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007015 writer.pos = outpos;
7016 if (writer.pos == size)
7017 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007018
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007019 s += writer.pos;
7020 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007021 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007022 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007024 PyUnicode_WRITE(kind, data, writer.pos, c);
7025 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007027 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007028 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007029
7030 /* byte outsize range 0x00..0x7f: call the error handler */
7031
7032 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007033 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007034
7035 switch (error_handler)
7036 {
7037 case _Py_ERROR_REPLACE:
7038 case _Py_ERROR_SURROGATEESCAPE:
7039 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007040 but we may switch to UCS2 at the first write */
7041 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7042 goto onError;
7043 kind = writer.kind;
7044 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007045
7046 if (error_handler == _Py_ERROR_REPLACE)
7047 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7048 else
7049 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7050 writer.pos++;
7051 ++s;
7052 break;
7053
7054 case _Py_ERROR_IGNORE:
7055 ++s;
7056 break;
7057
7058 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 startinpos = s-starts;
7060 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007061 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007062 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 "ascii", "ordinal not in range(128)",
7064 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007065 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007066 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007067 kind = writer.kind;
7068 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007069 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007071 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007072 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007073 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007074
Benjamin Peterson29060642009-01-31 22:14:21 +00007075 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007076 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007077 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007078 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079 return NULL;
7080}
7081
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007082/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007083PyObject *
7084PyUnicode_EncodeASCII(const Py_UNICODE *p,
7085 Py_ssize_t size,
7086 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007088 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007089 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007090 if (unicode == NULL)
7091 return NULL;
7092 result = unicode_encode_ucs1(unicode, errors, 128);
7093 Py_DECREF(unicode);
7094 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095}
7096
Alexander Belopolsky40018472011-02-26 01:02:56 +00007097PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007098_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099{
7100 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007101 PyErr_BadArgument();
7102 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007104 if (PyUnicode_READY(unicode) == -1)
7105 return NULL;
7106 /* Fast path: if it is an ASCII-only string, construct bytes object
7107 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007108 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007109 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7110 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007111 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007112}
7113
7114PyObject *
7115PyUnicode_AsASCIIString(PyObject *unicode)
7116{
7117 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118}
7119
Steve Dowercc16be82016-09-08 10:35:16 -07007120#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007121
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007122/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007123
/* When size_t is wider than int, a single buffer may be longer than the
   `int` lengths used by the code-page helpers below, so the input has to be
   processed in several chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Provide the flag value when the platform headers do not define it. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7131
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007132static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007133code_page_name(UINT code_page, PyObject **obj)
7134{
7135 *obj = NULL;
7136 if (code_page == CP_ACP)
7137 return "mbcs";
7138 if (code_page == CP_UTF7)
7139 return "CP_UTF7";
7140 if (code_page == CP_UTF8)
7141 return "CP_UTF8";
7142
7143 *obj = PyBytes_FromFormat("cp%u", code_page);
7144 if (*obj == NULL)
7145 return NULL;
7146 return PyBytes_AS_STRING(*obj);
7147}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007148
Victor Stinner3a50e702011-10-18 21:21:00 +02007149static DWORD
7150decode_code_page_flags(UINT code_page)
7151{
7152 if (code_page == CP_UTF7) {
7153 /* The CP_UTF7 decoder only supports flags=0 */
7154 return 0;
7155 }
7156 else
7157 return MB_ERR_INVALID_CHARS;
7158}
7159
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007160/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007161 * Decode a byte string from a Windows code page into unicode object in strict
7162 * mode.
7163 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007164 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7165 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007166 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007167static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007168decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007169 wchar_t **buf,
7170 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007171 const char *in,
7172 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007173{
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007175 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007176 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007177
7178 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 assert(insize > 0);
7180 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7181 if (outsize <= 0)
7182 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007183
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007184 /* Extend a wchar_t* buffer */
7185 Py_ssize_t n = *bufsize; /* Get the current length */
7186 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7187 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007188 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007189 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007190
7191 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7193 if (outsize <= 0)
7194 goto error;
7195 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007196
Victor Stinner3a50e702011-10-18 21:21:00 +02007197error:
7198 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7199 return -2;
7200 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007201 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007202}
7203
Victor Stinner3a50e702011-10-18 21:21:00 +02007204/*
7205 * Decode a byte string from a code page into unicode object with an error
7206 * handler.
7207 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007208 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 * UnicodeDecodeError exception and returns -1 on error.
7210 */
7211static int
7212decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007213 wchar_t **buf,
7214 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007215 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007216 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007217{
7218 const char *startin = in;
7219 const char *endin = in + size;
7220 const DWORD flags = decode_code_page_flags(code_page);
7221 /* Ideally, we should get reason from FormatMessage. This is the Windows
7222 2000 English version of the message. */
7223 const char *reason = "No mapping for the Unicode character exists "
7224 "in the target code page.";
7225 /* each step cannot decode more than 1 character, but a character can be
7226 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007227 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007228 int insize;
7229 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 PyObject *errorHandler = NULL;
7231 PyObject *exc = NULL;
7232 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007233 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 DWORD err;
7235 int ret = -1;
7236
7237 assert(size > 0);
7238
7239 encoding = code_page_name(code_page, &encoding_obj);
7240 if (encoding == NULL)
7241 return -1;
7242
Victor Stinner7d00cc12014-03-17 23:08:06 +01007243 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007244 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7245 UnicodeDecodeError. */
7246 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7247 if (exc != NULL) {
7248 PyCodec_StrictErrors(exc);
7249 Py_CLEAR(exc);
7250 }
7251 goto error;
7252 }
7253
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007254 /* Extend a wchar_t* buffer */
7255 Py_ssize_t n = *bufsize; /* Get the current length */
7256 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7257 PyErr_NoMemory();
7258 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007259 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007260 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7261 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007262 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007263 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007264
7265 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007266 while (in < endin)
7267 {
7268 /* Decode a character */
7269 insize = 1;
7270 do
7271 {
7272 outsize = MultiByteToWideChar(code_page, flags,
7273 in, insize,
7274 buffer, Py_ARRAY_LENGTH(buffer));
7275 if (outsize > 0)
7276 break;
7277 err = GetLastError();
7278 if (err != ERROR_NO_UNICODE_TRANSLATION
7279 && err != ERROR_INSUFFICIENT_BUFFER)
7280 {
7281 PyErr_SetFromWindowsErr(0);
7282 goto error;
7283 }
7284 insize++;
7285 }
7286 /* 4=maximum length of a UTF-8 sequence */
7287 while (insize <= 4 && (in + insize) <= endin);
7288
7289 if (outsize <= 0) {
7290 Py_ssize_t startinpos, endinpos, outpos;
7291
Victor Stinner7d00cc12014-03-17 23:08:06 +01007292 /* last character in partial decode? */
7293 if (in + insize >= endin && !final)
7294 break;
7295
Victor Stinner3a50e702011-10-18 21:21:00 +02007296 startinpos = in - startin;
7297 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007298 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007299 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007300 errors, &errorHandler,
7301 encoding, reason,
7302 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007303 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 {
7305 goto error;
7306 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007307 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007308 }
7309 else {
7310 in += insize;
7311 memcpy(out, buffer, outsize * sizeof(wchar_t));
7312 out += outsize;
7313 }
7314 }
7315
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007316 /* Shrink the buffer */
7317 assert(out - *buf <= *bufsize);
7318 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007319 /* (in - startin) <= size and size is an int */
7320 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007321
7322error:
7323 Py_XDECREF(encoding_obj);
7324 Py_XDECREF(errorHandler);
7325 Py_XDECREF(exc);
7326 return ret;
7327}
7328
Victor Stinner3a50e702011-10-18 21:21:00 +02007329static PyObject *
7330decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007331 const char *s, Py_ssize_t size,
7332 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007333{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007334 wchar_t *buf = NULL;
7335 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007336 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 if (code_page < 0) {
7339 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7340 return NULL;
7341 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007342 if (size < 0) {
7343 PyErr_BadInternalCall();
7344 return NULL;
7345 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007346
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007347 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007349
Victor Stinner76a31a62011-11-04 00:05:13 +01007350 do
7351 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007352#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007353 if (size > INT_MAX) {
7354 chunk_size = INT_MAX;
7355 final = 0;
7356 done = 0;
7357 }
7358 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007359#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007360 {
7361 chunk_size = (int)size;
7362 final = (consumed == NULL);
7363 done = 1;
7364 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007365
Victor Stinner76a31a62011-11-04 00:05:13 +01007366 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007367 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007368 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007369 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007370 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007371
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007372 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007373 s, chunk_size);
7374 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007375 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007376 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007377 errors, final);
7378 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007379
7380 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007381 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007382 return NULL;
7383 }
7384
7385 if (consumed)
7386 *consumed += converted;
7387
7388 s += converted;
7389 size -= converted;
7390 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007391
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007392 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7393 PyMem_Free(buf);
7394 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007395}
7396
Alexander Belopolsky40018472011-02-26 01:02:56 +00007397PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007398PyUnicode_DecodeCodePageStateful(int code_page,
7399 const char *s,
7400 Py_ssize_t size,
7401 const char *errors,
7402 Py_ssize_t *consumed)
7403{
7404 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7405}
7406
7407PyObject *
7408PyUnicode_DecodeMBCSStateful(const char *s,
7409 Py_ssize_t size,
7410 const char *errors,
7411 Py_ssize_t *consumed)
7412{
7413 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7414}
7415
7416PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007417PyUnicode_DecodeMBCS(const char *s,
7418 Py_ssize_t size,
7419 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007420{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007421 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7422}
7423
Victor Stinner3a50e702011-10-18 21:21:00 +02007424static DWORD
7425encode_code_page_flags(UINT code_page, const char *errors)
7426{
7427 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007428 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 }
7430 else if (code_page == CP_UTF7) {
7431 /* CP_UTF7 only supports flags=0 */
7432 return 0;
7433 }
7434 else {
7435 if (errors != NULL && strcmp(errors, "replace") == 0)
7436 return 0;
7437 else
7438 return WC_NO_BEST_FIT_CHARS;
7439 }
7440}
7441
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007442/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 * Encode a Unicode string to a Windows code page into a byte string in strict
7444 * mode.
7445 *
7446 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007447 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007448 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007449static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007450encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007451 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007453{
Victor Stinner554f3f02010-06-16 23:33:54 +00007454 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 BOOL *pusedDefaultChar = &usedDefaultChar;
7456 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007457 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007458 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 const DWORD flags = encode_code_page_flags(code_page, NULL);
7460 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007461 /* Create a substring so that we can get the UTF-16 representation
7462 of just the slice under consideration. */
7463 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007464
Martin v. Löwis3d325192011-11-04 18:23:06 +01007465 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007466
Victor Stinner3a50e702011-10-18 21:21:00 +02007467 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007468 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007470 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007471
Victor Stinner2fc507f2011-11-04 20:06:39 +01007472 substring = PyUnicode_Substring(unicode, offset, offset+len);
7473 if (substring == NULL)
7474 return -1;
7475 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7476 if (p == NULL) {
7477 Py_DECREF(substring);
7478 return -1;
7479 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007480 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007481
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007482 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007484 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 NULL, 0,
7486 NULL, pusedDefaultChar);
7487 if (outsize <= 0)
7488 goto error;
7489 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007490 if (pusedDefaultChar && *pusedDefaultChar) {
7491 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007493 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007494
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007496 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007497 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007498 if (*outbytes == NULL) {
7499 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007500 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007501 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007503 }
7504 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007505 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007506 const Py_ssize_t n = PyBytes_Size(*outbytes);
7507 if (outsize > PY_SSIZE_T_MAX - n) {
7508 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007509 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007512 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7513 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007514 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007515 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007517 }
7518
7519 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007520 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007521 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007522 out, outsize,
7523 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007524 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007525 if (outsize <= 0)
7526 goto error;
7527 if (pusedDefaultChar && *pusedDefaultChar)
7528 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007529 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007530
Victor Stinner3a50e702011-10-18 21:21:00 +02007531error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007532 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007533 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7534 return -2;
7535 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007536 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007537}
7538
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * The slice unicode[unicode_offset:unicode_offset+insize] is encoded one
 * code point at a time and appended to *outbytes, which is created when it
 * is NULL.  Characters the code page cannot represent are passed to the
 * error handler named by 'errors'.
 *
 * Returns 0 on success, or -1 on error (an OSError is raised when
 * WideCharToMultiByte() fails for a reason other than an untranslatable
 * character).
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* The "used default char" out-parameter is not passed for UTF-7/UTF-8. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve sizeof(buffer) bytes per input character; check that the
       multiplication below cannot overflow Py_ssize_t. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Build the wchar_t form of this code point: one unit below
           0x10000, a surrogate pair otherwise. */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                /* Encoded without substitution: copy and move on. */
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
            /* A default character was substituted: fall through to the
               error handler. */
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;
        pos = newpos;

        if (PyBytes_Check(rep)) {
            /* bytes replacement: copied verbatim */
            outsize = PyBytes_GET_SIZE(rep);
            if (outsize != 1) {
                /* Adjust the buffer by (replacement length - 1) bytes.
                   'out' is re-derived from the saved offset because
                   _PyBytes_Resize() may replace *outbytes. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* Non-bytes replacement, read as a str: every character must be
               <= 127 and is stored as a single byte. */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            void *data;

            if (PyUnicode_READY(rep) == -1) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            if (outsize != 1) {
                /* Same adjustment as in the bytes branch above. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Shrink the over-allocated buffer to the bytes actually written. */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

    /* Shared exit: reached on success (ret == 0) and on failure (ret == -1). */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7726
Victor Stinner3a50e702011-10-18 21:21:00 +02007727static PyObject *
7728encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007729 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007730 const char *errors)
7731{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007732 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007733 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007734 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007735 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007736
Victor Stinner29dacf22015-01-26 16:41:32 +01007737 if (!PyUnicode_Check(unicode)) {
7738 PyErr_BadArgument();
7739 return NULL;
7740 }
7741
Benjamin Petersonbac79492012-01-14 13:34:47 -05007742 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007743 return NULL;
7744 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007745
Victor Stinner3a50e702011-10-18 21:21:00 +02007746 if (code_page < 0) {
7747 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7748 return NULL;
7749 }
7750
Martin v. Löwis3d325192011-11-04 18:23:06 +01007751 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007752 return PyBytes_FromStringAndSize(NULL, 0);
7753
Victor Stinner7581cef2011-11-03 22:32:33 +01007754 offset = 0;
7755 do
7756 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007757#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007758 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007759 chunks. */
7760 if (len > INT_MAX/2) {
7761 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007762 done = 0;
7763 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007764 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007765#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007766 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007767 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007768 done = 1;
7769 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007770
Victor Stinner76a31a62011-11-04 00:05:13 +01007771 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007772 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007773 errors);
7774 if (ret == -2)
7775 ret = encode_code_page_errors(code_page, &outbytes,
7776 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007777 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007778 if (ret < 0) {
7779 Py_XDECREF(outbytes);
7780 return NULL;
7781 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007782
Victor Stinner7581cef2011-11-03 22:32:33 +01007783 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007784 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007785 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007786
Victor Stinner3a50e702011-10-18 21:21:00 +02007787 return outbytes;
7788}
7789
7790PyObject *
7791PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7792 Py_ssize_t size,
7793 const char *errors)
7794{
Victor Stinner7581cef2011-11-03 22:32:33 +01007795 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007796 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007797 if (unicode == NULL)
7798 return NULL;
7799 res = encode_code_page(CP_ACP, unicode, errors);
7800 Py_DECREF(unicode);
7801 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007802}
7803
7804PyObject *
7805PyUnicode_EncodeCodePage(int code_page,
7806 PyObject *unicode,
7807 const char *errors)
7808{
Victor Stinner7581cef2011-11-03 22:32:33 +01007809 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007810}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007811
Alexander Belopolsky40018472011-02-26 01:02:56 +00007812PyObject *
7813PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007814{
Victor Stinner7581cef2011-11-03 22:32:33 +01007815 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007816}
7817
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007818#undef NEED_RETRY
7819
Steve Dowercc16be82016-09-08 10:35:16 -07007820#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007821
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822/* --- Character Mapping Codec -------------------------------------------- */
7823
/* Decode the 'size' bytes at 's' through 'mapping', a str object used as a
   lookup table indexed by byte value, appending the result to 'writer'.
   A table entry of 0xFFFE, or a byte value beyond the end of the table,
   is an undefined mapping and is routed to the error handler 'errors'.
   Returns 0 on success, -1 on error. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Raise the writer's maxchar to 0xff, then refresh the
                   cached maxchar and data pointer from the writer. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        /* errorHandler and exc are still NULL here: nothing to release. */
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight inner loops for a 2-byte table with at least 256
               entries.  They write straight into the writer's buffer and
               jump to Error (with ch and x already set) as soon as a
               value needs the generic handling below. */
            enum PyUnicode_Kind outkind = writer->kind;
            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
        /* Also entered by goto from the inner loops above. */
Error:
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            /* 's' is passed by address to the handler call above, so it is
               not advanced here. */
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
7942
/* Decode the 'size' bytes at 's' through an arbitrary 'mapping' object
   looked up with PyObject_GetItem(mapping, int), appending the result to
   'writer'.  A lookup may yield an int (code point), a str, or None.
   A LookupError, None, or the value 0xFFFE means "undefined" and is routed
   to the error handler 'errors'.
   Returns 0 on success, -1 on error. */
static int
charmap_decode_mapping(const char *s,
                       Py_ssize_t size,
                       PyObject *mapping,
                       const char *errors,
                       _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    unsigned char ch;
    /* 'item' holds the current lookup result; it is cleared on every path
       that leaves an iteration, and released at onError. */
    PyObject *key, *item = NULL;

    e = s + size;

    while (s < e) {
        ch = *s;

        /* Get mapping (char ordinal -> integer, Unicode char or None) */
        key = PyLong_FromLong((long)ch);
        if (key == NULL)
            goto onError;

        item = PyObject_GetItem(mapping, key);
        Py_DECREF(key);
        if (item == NULL) {
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                /* No mapping found means: mapping is undefined. */
                PyErr_Clear();
                goto Undefined;
            } else
                goto onError;
        }

        /* Apply mapping */
        if (item == Py_None)
            goto Undefined;
        if (PyLong_Check(item)) {
            long value = PyLong_AS_LONG(item);
            if (value == 0xFFFE)
                goto Undefined;
            if (value < 0 || value > MAX_UNICODE) {
                PyErr_Format(PyExc_TypeError,
                             "character mapping must be in range(0x%lx)",
                             (unsigned long)MAX_UNICODE + 1);
                goto onError;
            }

            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
                goto onError;
        }
        else if (PyUnicode_Check(item)) {
            if (PyUnicode_READY(item) == -1)
                goto onError;
            if (PyUnicode_GET_LENGTH(item) == 1) {
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
                if (value == 0xFFFE)
                    goto Undefined;
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
                    goto onError;
            }
            else {
                /* A str whose length is not 1 is written as-is, after
                   switching the writer to overallocation. */
                writer->overallocate = 1;
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
                    goto onError;
            }
        }
        else {
            /* wrong return value */
            PyErr_SetString(PyExc_TypeError,
                            "character mapping must return integer, None or str");
            goto onError;
        }
        Py_CLEAR(item);
        ++s;
        continue;

Undefined:
        /* undefined mapping */
        Py_CLEAR(item);
        startinpos = s-starts;
        endinpos = startinpos+1;
        /* 's' is passed by address to the handler call, so it is not
           advanced here. */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "charmap", "character maps to <undefined>",
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                writer)) {
            goto onError;
        }
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(item);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
8044
Alexander Belopolsky40018472011-02-26 01:02:56 +00008045PyObject *
8046PyUnicode_DecodeCharmap(const char *s,
8047 Py_ssize_t size,
8048 PyObject *mapping,
8049 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008051 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008052
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053 /* Default to Latin-1 */
8054 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008058 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008059 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008060 writer.min_length = size;
8061 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008063
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008064 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008065 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8066 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008067 }
8068 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008069 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8070 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008072 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008073
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008075 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076 return NULL;
8077}
8078
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008079/* Charmap encoding: the lookup table */
8080
/* Three-level lookup table from a character (<= 0xFFFF) to a byte value.
   The object is allocated with extra trailing space: level23 is declared
   with one element but actually holds 16*count2 bytes of level-2 entries
   followed by 128*count3 bytes of level-3 entries. */
struct encoding_map {
    PyObject_HEAD
    /* indexed by ch >> 11; 0xFF marks an unused slot */
    unsigned char level1[32];
    /* number of 16-entry level-2 blocks and 128-entry level-3 blocks */
    int count2, count3;
    /* start of the variable-length level-2 + level-3 storage */
    unsigned char level23[1];
};
8087
8088static PyObject*
8089encoding_map_size(PyObject *obj, PyObject* args)
8090{
8091 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008092 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008094}
8095
8096static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008097 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 PyDoc_STR("Return the size (in bytes) of this object") },
8099 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008100};
8101
/* tp_dealloc of EncodingMap: the object owns no references, so releasing
   its memory block is all that is done here. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
8107
/* Type object for struct encoding_map.  Only tp_dealloc, tp_flags and
   tp_methods are filled in; every other slot is 0.  tp_itemsize is 0 even
   though instances carry variable-length trailing data (see
   PyUnicode_BuildEncodingMap, which allocates them by hand). */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
8151
/* Build an encoding table from a decoding table.

   'string' is a non-empty str whose character at index i is the character
   that byte value i decodes to; only the first 256 characters are used.
   The result maps characters back to byte values and is either
     - an EncodingMap object (three-level trie), or
     - a dict {character ordinal: index}, used when string[0] is not NUL,
       when a later entry is NUL or above 0xFFFF, or when too many trie
       blocks would be needed.
   Returns a new reference, or NULL with an exception set. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    PyObject *result;
    struct encoding_map *mresult;
    int i;
    int need_dict = 0;
    /* Scratch tables used to count the distinct level-2 and level-3 blocks;
       0xFF means "not assigned yet". */
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0, count3 = 0;
    int kind;
    void *data;
    Py_ssize_t length;
    Py_UCS4 ch;

    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(string);
    data = PyUnicode_DATA(string);
    length = PyUnicode_GET_LENGTH(string);
    length = Py_MIN(length, 256);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0)
        need_dict = 1;
    /* First pass: number the level-2 blocks (by ch >> 11) and level-3
       blocks (by ch >> 7) in order of first appearance. */
    for (i = 1; i < length; i++) {
        int l1, l2;
        ch = PyUnicode_READ(kind, data, i);
        if (ch == 0 || ch > 0xFFFF) {
            need_dict = 1;
            break;
        }
        if (ch == 0xFFFE)
            /* unmapped character */
            continue;
        l1 = ch >> 11;
        l2 = ch >> 7;
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* 0xFF is the "unused" marker, so block numbers must stay below it. */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        /* Fallback: plain dict from character ordinal to index.
           Note: this 'result' shadows the outer variable. */
        PyObject *result = PyDict_New();
        PyObject *key, *value;
        if (!result)
            return NULL;
        for (i = 0; i < length; i++) {
            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
            value = PyLong_FromLong(i);
            if (!key || !value)
                goto failed1;
            if (PyDict_SetItem(result, key, value) == -1)
                goto failed1;
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return result;
      failed1:
        Py_XDECREF(key);
        Py_XDECREF(value);
        Py_DECREF(result);
        return NULL;
    }

    /* Create a three-level trie */
    /* The "- 1" accounts for the one level23 byte already counted in
       sizeof(struct encoding_map). */
    result = PyObject_MALLOC(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (!result)
        return PyErr_NoMemory();
    PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);
    /* Second pass: count3 is reused as the running level-3 block number
       while the level-2 and level-3 tables are filled in. */
    count3 = 0;
    for (i = 1; i < length; i++) {
        int o1, o2, o3, i2, i3;
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        if (ch == 0xFFFE)
            /* unmapped character */
            continue;
        o1 = ch>>11;
        o2 = (ch>>7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = ch & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        /* A level-3 entry of 0 means "no mapping"; index 0 itself is never
           stored because the loop starts at 1. */
        mlevel3[i3] = i;
    }
    return result;
}
8261
8262static int
Victor Stinner22168992011-11-20 17:09:18 +01008263encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008264{
8265 struct encoding_map *map = (struct encoding_map*)mapping;
8266 int l1 = c>>11;
8267 int l2 = (c>>7) & 0xF;
8268 int l3 = c & 0x7F;
8269 int i;
8270
Victor Stinner22168992011-11-20 17:09:18 +01008271 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008273 if (c == 0)
8274 return 0;
8275 /* level 1*/
8276 i = map->level1[l1];
8277 if (i == 0xFF) {
8278 return -1;
8279 }
8280 /* level 2*/
8281 i = map->level23[16*i+l2];
8282 if (i == 0xFF) {
8283 return -1;
8284 }
8285 /* level 3 */
8286 i = map->level23[16*map->count2 + 128*i + l3];
8287 if (i == 0) {
8288 return -1;
8289 }
8290 return i;
8291}
8292
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008293/* Lookup the character ch in the mapping. If the character
8294 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008295 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008296static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008297charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298{
Christian Heimes217cfd12007-12-02 14:31:20 +00008299 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008300 PyObject *x;
8301
8302 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008304 x = PyObject_GetItem(mapping, w);
8305 Py_DECREF(w);
8306 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8308 /* No mapping found means: mapping is undefined. */
8309 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008310 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 } else
8312 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008314 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008316 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 long value = PyLong_AS_LONG(x);
8318 if (value < 0 || value > 255) {
8319 PyErr_SetString(PyExc_TypeError,
8320 "character mapping must be in range(256)");
8321 Py_DECREF(x);
8322 return NULL;
8323 }
8324 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008326 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 /* wrong return value */
8330 PyErr_Format(PyExc_TypeError,
8331 "character mapping must return integer, bytes or None, not %.400s",
8332 x->ob_type->tp_name);
8333 Py_DECREF(x);
8334 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008335 }
8336}
8337
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008338static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008339charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008340{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008341 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8342 /* exponentially overallocate to minimize reallocations */
8343 if (requiredsize < 2*outsize)
8344 requiredsize = 2*outsize;
8345 if (_PyBytes_Resize(outobj, requiredsize))
8346 return -1;
8347 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008348}
8349
/* Outcome of trying to encode one character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008353/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008354 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008355 space is available. Return a new reference to the object that
8356 was put in the output buffer, or Py_None, if the mapping was undefined
8357 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008358 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008359static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008360charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008361 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008363 PyObject *rep;
8364 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008365 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366
Christian Heimes90aa7642007-12-19 02:45:37 +00008367 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008368 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008370 if (res == -1)
8371 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 if (outsize<requiredsize)
8373 if (charmapencode_resize(outobj, outpos, requiredsize))
8374 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008375 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 outstart[(*outpos)++] = (char)res;
8377 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008378 }
8379
8380 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008383 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 Py_DECREF(rep);
8385 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008386 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 if (PyLong_Check(rep)) {
8388 Py_ssize_t requiredsize = *outpos+1;
8389 if (outsize<requiredsize)
8390 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8391 Py_DECREF(rep);
8392 return enc_EXCEPTION;
8393 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008394 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008396 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 else {
8398 const char *repchars = PyBytes_AS_STRING(rep);
8399 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8400 Py_ssize_t requiredsize = *outpos+repsize;
8401 if (outsize<requiredsize)
8402 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8403 Py_DECREF(rep);
8404 return enc_EXCEPTION;
8405 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008406 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 memcpy(outstart + *outpos, repchars, repsize);
8408 *outpos += repsize;
8409 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008411 Py_DECREF(rep);
8412 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413}
8414
8415/* handle an error in PyUnicode_EncodeCharmap
8416 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008417static int
8418charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008419 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008420 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008421 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008422 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008423{
8424 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008425 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008426 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008427 enum PyUnicode_Kind kind;
8428 void *data;
8429 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008431 Py_ssize_t collstartpos = *inpos;
8432 Py_ssize_t collendpos = *inpos+1;
8433 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008434 const char *encoding = "charmap";
8435 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008436 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008437 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008438 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008439
Benjamin Petersonbac79492012-01-14 13:34:47 -05008440 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008441 return -1;
8442 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008443 /* find all unencodable characters */
8444 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008445 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008446 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008447 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008448 val = encoding_map_lookup(ch, mapping);
8449 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 break;
8451 ++collendpos;
8452 continue;
8453 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008454
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008455 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8456 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 if (rep==NULL)
8458 return -1;
8459 else if (rep!=Py_None) {
8460 Py_DECREF(rep);
8461 break;
8462 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008463 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008465 }
8466 /* cache callback name lookup
8467 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008468 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008469 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008470
8471 switch (*error_handler) {
8472 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008473 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008474 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008475
8476 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008477 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 x = charmapencode_output('?', mapping, res, respos);
8479 if (x==enc_EXCEPTION) {
8480 return -1;
8481 }
8482 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008483 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 return -1;
8485 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008486 }
8487 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008488 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008489 *inpos = collendpos;
8490 break;
Victor Stinner50149202015-09-22 00:26:54 +02008491
8492 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008493 /* generate replacement (temporarily (mis)uses p) */
8494 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 char buffer[2+29+1+1];
8496 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008497 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 for (cp = buffer; *cp; ++cp) {
8499 x = charmapencode_output(*cp, mapping, res, respos);
8500 if (x==enc_EXCEPTION)
8501 return -1;
8502 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008503 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 return -1;
8505 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008506 }
8507 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008508 *inpos = collendpos;
8509 break;
Victor Stinner50149202015-09-22 00:26:54 +02008510
Benjamin Peterson14339b62009-01-31 16:36:08 +00008511 default:
Victor Stinner50149202015-09-22 00:26:54 +02008512 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008513 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008515 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008517 if (PyBytes_Check(repunicode)) {
8518 /* Directly copy bytes result to output. */
8519 Py_ssize_t outsize = PyBytes_Size(*res);
8520 Py_ssize_t requiredsize;
8521 repsize = PyBytes_Size(repunicode);
8522 requiredsize = *respos + repsize;
8523 if (requiredsize > outsize)
8524 /* Make room for all additional bytes. */
8525 if (charmapencode_resize(res, respos, requiredsize)) {
8526 Py_DECREF(repunicode);
8527 return -1;
8528 }
8529 memcpy(PyBytes_AsString(*res) + *respos,
8530 PyBytes_AsString(repunicode), repsize);
8531 *respos += repsize;
8532 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008533 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008534 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008535 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008536 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008537 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008538 Py_DECREF(repunicode);
8539 return -1;
8540 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008541 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008542 data = PyUnicode_DATA(repunicode);
8543 kind = PyUnicode_KIND(repunicode);
8544 for (index = 0; index < repsize; index++) {
8545 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8546 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008548 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 return -1;
8550 }
8551 else if (x==enc_FAILED) {
8552 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008553 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 return -1;
8555 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008556 }
8557 *inpos = newpos;
8558 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008559 }
8560 return 0;
8561}
8562
Alexander Belopolsky40018472011-02-26 01:02:56 +00008563PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008564_PyUnicode_EncodeCharmap(PyObject *unicode,
8565 PyObject *mapping,
8566 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008568 /* output object */
8569 PyObject *res = NULL;
8570 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008571 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008572 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008573 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008574 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008575 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008576 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008577 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008578 void *data;
8579 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580
Benjamin Petersonbac79492012-01-14 13:34:47 -05008581 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008582 return NULL;
8583 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008584 data = PyUnicode_DATA(unicode);
8585 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008586
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587 /* Default to Latin-1 */
8588 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008589 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008591 /* allocate enough for a simple encoding without
8592 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008593 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008594 if (res == NULL)
8595 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008596 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008600 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008602 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 if (x==enc_EXCEPTION) /* error */
8604 goto onError;
8605 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008606 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008608 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 &res, &respos)) {
8610 goto onError;
8611 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008612 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 else
8614 /* done with this character => adjust input position */
8615 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008618 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008619 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008620 if (_PyBytes_Resize(&res, respos) < 0)
8621 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008622
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008624 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625 return res;
8626
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008628 Py_XDECREF(res);
8629 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008630 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 return NULL;
8632}
8633
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008634/* Deprecated */
8635PyObject *
8636PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8637 Py_ssize_t size,
8638 PyObject *mapping,
8639 const char *errors)
8640{
8641 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008642 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008643 if (unicode == NULL)
8644 return NULL;
8645 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8646 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008647 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008648}
8649
Alexander Belopolsky40018472011-02-26 01:02:56 +00008650PyObject *
8651PyUnicode_AsCharmapString(PyObject *unicode,
8652 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653{
8654 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 PyErr_BadArgument();
8656 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008658 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659}
8660
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008662static void
8663make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008665 Py_ssize_t startpos, Py_ssize_t endpos,
8666 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008669 *exceptionObject = _PyUnicodeTranslateError_Create(
8670 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 }
8672 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8674 goto onError;
8675 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8676 goto onError;
8677 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8678 goto onError;
8679 return;
8680 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008681 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 }
8683}
8684
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685/* error handling callback helper:
8686 build arguments, call the callback and check the arguments,
8687 put the result into newpos and return the replacement string, which
8688 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008689static PyObject *
8690unicode_translate_call_errorhandler(const char *errors,
8691 PyObject **errorHandler,
8692 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008694 Py_ssize_t startpos, Py_ssize_t endpos,
8695 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008697 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008698
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008699 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008700 PyObject *restuple;
8701 PyObject *resunicode;
8702
8703 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008705 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008707 }
8708
8709 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008713
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008714 restuple = PyObject_CallFunctionObjArgs(
8715 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008719 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 Py_DECREF(restuple);
8721 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008722 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008723 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 &resunicode, &i_newpos)) {
8725 Py_DECREF(restuple);
8726 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008727 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008728 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008730 else
8731 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008732 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008733 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 Py_DECREF(restuple);
8735 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008736 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008737 Py_INCREF(resunicode);
8738 Py_DECREF(restuple);
8739 return resunicode;
8740}
8741
8742/* Lookup the character ch in the mapping and put the result in result,
8743 which must be decrefed by the caller.
8744 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008745static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008747{
Christian Heimes217cfd12007-12-02 14:31:20 +00008748 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008749 PyObject *x;
8750
8751 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008753 x = PyObject_GetItem(mapping, w);
8754 Py_DECREF(w);
8755 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008756 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8757 /* No mapping found means: use 1:1 mapping. */
8758 PyErr_Clear();
8759 *result = NULL;
8760 return 0;
8761 } else
8762 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008763 }
8764 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 *result = x;
8766 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008767 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008768 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008770 if (value < 0 || value > MAX_UNICODE) {
8771 PyErr_Format(PyExc_ValueError,
8772 "character mapping must be in range(0x%x)",
8773 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 Py_DECREF(x);
8775 return -1;
8776 }
8777 *result = x;
8778 return 0;
8779 }
8780 else if (PyUnicode_Check(x)) {
8781 *result = x;
8782 return 0;
8783 }
8784 else {
8785 /* wrong return value */
8786 PyErr_SetString(PyExc_TypeError,
8787 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008788 Py_DECREF(x);
8789 return -1;
8790 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008791}
Victor Stinner1194ea02014-04-04 19:37:40 +02008792
8793/* lookup the character, write the result into the writer.
8794 Return 1 if the result was written into the writer, return 0 if the mapping
8795 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008796static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008797charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8798 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008799{
Victor Stinner1194ea02014-04-04 19:37:40 +02008800 PyObject *item;
8801
8802 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008804
8805 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008806 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008807 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008808 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008809 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008810 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008811 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008812
8813 if (item == Py_None) {
8814 Py_DECREF(item);
8815 return 0;
8816 }
8817
8818 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008819 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8820 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8821 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008822 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8823 Py_DECREF(item);
8824 return -1;
8825 }
8826 Py_DECREF(item);
8827 return 1;
8828 }
8829
8830 if (!PyUnicode_Check(item)) {
8831 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008833 }
8834
8835 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8836 Py_DECREF(item);
8837 return -1;
8838 }
8839
8840 Py_DECREF(item);
8841 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008842}
8843
Victor Stinner89a76ab2014-04-05 11:44:04 +02008844static int
8845unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8846 Py_UCS1 *translate)
8847{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008848 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008849 int ret = 0;
8850
Victor Stinner89a76ab2014-04-05 11:44:04 +02008851 if (charmaptranslate_lookup(ch, mapping, &item)) {
8852 return -1;
8853 }
8854
8855 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008856 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008857 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008858 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008859 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008860 /* not found => default to 1:1 mapping */
8861 translate[ch] = ch;
8862 return 1;
8863 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008864 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008865 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008866 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8867 used it */
8868 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008869 /* invalid character or character outside ASCII:
8870 skip the fast translate */
8871 goto exit;
8872 }
8873 translate[ch] = (Py_UCS1)replace;
8874 }
8875 else if (PyUnicode_Check(item)) {
8876 Py_UCS4 replace;
8877
8878 if (PyUnicode_READY(item) == -1) {
8879 Py_DECREF(item);
8880 return -1;
8881 }
8882 if (PyUnicode_GET_LENGTH(item) != 1)
8883 goto exit;
8884
8885 replace = PyUnicode_READ_CHAR(item, 0);
8886 if (replace > 127)
8887 goto exit;
8888 translate[ch] = (Py_UCS1)replace;
8889 }
8890 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008891 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008892 goto exit;
8893 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008894 ret = 1;
8895
Benjamin Peterson1365de72014-04-07 20:15:41 -04008896 exit:
8897 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008898 return ret;
8899}
8900
8901/* Fast path for ascii => ascii translation. Return 1 if the whole string
8902 was translated into writer, return 0 if the input string was partially
8903 translated into writer, raise an exception and return -1 on error. */
8904static int
8905unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008906 _PyUnicodeWriter *writer, int ignore,
8907 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008908{
Victor Stinner872b2912014-04-05 14:27:07 +02008909 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008910 Py_ssize_t len;
8911 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008912 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008913
Victor Stinner89a76ab2014-04-05 11:44:04 +02008914 len = PyUnicode_GET_LENGTH(input);
8915
Victor Stinner872b2912014-04-05 14:27:07 +02008916 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008917
8918 in = PyUnicode_1BYTE_DATA(input);
8919 end = in + len;
8920
8921 assert(PyUnicode_IS_ASCII(writer->buffer));
8922 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8923 out = PyUnicode_1BYTE_DATA(writer->buffer);
8924
Victor Stinner872b2912014-04-05 14:27:07 +02008925 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008926 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008927 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008928 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008929 int translate = unicode_fast_translate_lookup(mapping, ch,
8930 ascii_table);
8931 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008932 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008933 if (translate == 0)
8934 goto exit;
8935 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008936 }
Victor Stinner872b2912014-04-05 14:27:07 +02008937 if (ch2 == 0xfe) {
8938 if (ignore)
8939 continue;
8940 goto exit;
8941 }
8942 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008943 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008944 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008945 }
Victor Stinner872b2912014-04-05 14:27:07 +02008946 res = 1;
8947
8948exit:
8949 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008950 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008951 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008952}
8953
Victor Stinner3222da22015-10-01 22:07:32 +02008954static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955_PyUnicode_TranslateCharmap(PyObject *input,
8956 PyObject *mapping,
8957 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008959 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008960 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 Py_ssize_t size, i;
8962 int kind;
8963 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008964 _PyUnicodeWriter writer;
8965 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008966 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008967 PyObject *errorHandler = NULL;
8968 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008969 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008970 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008971
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 PyErr_BadArgument();
8974 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 if (PyUnicode_READY(input) == -1)
8978 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008979 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 kind = PyUnicode_KIND(input);
8981 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008983 if (size == 0)
8984 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008986 /* allocate enough for a simple 1:1 translation without
8987 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008988 _PyUnicodeWriter_Init(&writer);
8989 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991
Victor Stinner872b2912014-04-05 14:27:07 +02008992 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8993
Victor Stinner33798672016-03-01 21:59:58 +01008994 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008995 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008996 if (PyUnicode_IS_ASCII(input)) {
8997 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8998 if (res < 0) {
8999 _PyUnicodeWriter_Dealloc(&writer);
9000 return NULL;
9001 }
9002 if (res == 1)
9003 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009004 }
Victor Stinner33798672016-03-01 21:59:58 +01009005 else {
9006 i = 0;
9007 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009010 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009011 int translate;
9012 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9013 Py_ssize_t newpos;
9014 /* startpos for collecting untranslatable chars */
9015 Py_ssize_t collstart;
9016 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009017 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018
Victor Stinner1194ea02014-04-04 19:37:40 +02009019 ch = PyUnicode_READ(kind, data, i);
9020 translate = charmaptranslate_output(ch, mapping, &writer);
9021 if (translate < 0)
9022 goto onError;
9023
9024 if (translate != 0) {
9025 /* it worked => adjust input pointer */
9026 ++i;
9027 continue;
9028 }
9029
9030 /* untranslatable character */
9031 collstart = i;
9032 collend = i+1;
9033
9034 /* find all untranslatable characters */
9035 while (collend < size) {
9036 PyObject *x;
9037 ch = PyUnicode_READ(kind, data, collend);
9038 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009039 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009040 Py_XDECREF(x);
9041 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009042 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009043 ++collend;
9044 }
9045
9046 if (ignore) {
9047 i = collend;
9048 }
9049 else {
9050 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9051 reason, input, &exc,
9052 collstart, collend, &newpos);
9053 if (repunicode == NULL)
9054 goto onError;
9055 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009056 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009057 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009058 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009059 Py_DECREF(repunicode);
9060 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009061 }
9062 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009063 Py_XDECREF(exc);
9064 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009065 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066
Benjamin Peterson29060642009-01-31 22:14:21 +00009067 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009068 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009069 Py_XDECREF(exc);
9070 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071 return NULL;
9072}
9073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074/* Deprecated. Use PyUnicode_Translate instead. */
9075PyObject *
9076PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9077 Py_ssize_t size,
9078 PyObject *mapping,
9079 const char *errors)
9080{
Christian Heimes5f520f42012-09-11 14:03:25 +02009081 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009082 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009083 if (!unicode)
9084 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009085 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9086 Py_DECREF(unicode);
9087 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088}
9089
Alexander Belopolsky40018472011-02-26 01:02:56 +00009090PyObject *
9091PyUnicode_Translate(PyObject *str,
9092 PyObject *mapping,
9093 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009095 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009096 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009097 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098}
Tim Petersced69f82003-09-16 20:30:58 +00009099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100PyObject *
9101_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9102{
9103 if (!PyUnicode_Check(unicode)) {
9104 PyErr_BadInternalCall();
9105 return NULL;
9106 }
9107 if (PyUnicode_READY(unicode) == -1)
9108 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009109 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009110 /* If the string is already ASCII, just return the same string */
9111 Py_INCREF(unicode);
9112 return unicode;
9113 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009114
9115 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9116 PyObject *result = PyUnicode_New(len, 127);
9117 if (result == NULL) {
9118 return NULL;
9119 }
9120
9121 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9122 int kind = PyUnicode_KIND(unicode);
9123 const void *data = PyUnicode_DATA(unicode);
9124 Py_ssize_t i;
9125 for (i = 0; i < len; ++i) {
9126 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9127 if (ch < 127) {
9128 out[i] = ch;
9129 }
9130 else if (Py_UNICODE_ISSPACE(ch)) {
9131 out[i] = ' ';
9132 }
9133 else {
9134 int decimal = Py_UNICODE_TODECIMAL(ch);
9135 if (decimal < 0) {
9136 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009137 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009138 _PyUnicode_LENGTH(result) = i + 1;
9139 break;
9140 }
9141 out[i] = '0' + decimal;
9142 }
9143 }
9144
INADA Naoki16dfca42018-07-14 12:06:43 +09009145 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009146 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009147}
9148
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009149PyObject *
9150PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9151 Py_ssize_t length)
9152{
Victor Stinnerf0124502011-11-21 23:12:56 +01009153 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009154 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009155 Py_UCS4 maxchar;
9156 enum PyUnicode_Kind kind;
9157 void *data;
9158
Victor Stinner99d7ad02012-02-22 13:37:39 +01009159 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009160 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009161 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009162 if (ch > 127) {
9163 int decimal = Py_UNICODE_TODECIMAL(ch);
9164 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009165 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009166 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009167 }
9168 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009169
9170 /* Copy to a new string */
9171 decimal = PyUnicode_New(length, maxchar);
9172 if (decimal == NULL)
9173 return decimal;
9174 kind = PyUnicode_KIND(decimal);
9175 data = PyUnicode_DATA(decimal);
9176 /* Iterate over code points */
9177 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009178 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009179 if (ch > 127) {
9180 int decimal = Py_UNICODE_TODECIMAL(ch);
9181 if (decimal >= 0)
9182 ch = '0' + decimal;
9183 }
9184 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009185 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009186 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009187}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009188/* --- Decimal Encoder ---------------------------------------------------- */
9189
Alexander Belopolsky40018472011-02-26 01:02:56 +00009190int
9191PyUnicode_EncodeDecimal(Py_UNICODE *s,
9192 Py_ssize_t length,
9193 char *output,
9194 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009195{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009196 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009197 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009198 enum PyUnicode_Kind kind;
9199 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009200
9201 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009202 PyErr_BadArgument();
9203 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009204 }
9205
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009206 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009207 if (unicode == NULL)
9208 return -1;
9209
Victor Stinner42bf7752011-11-21 22:52:58 +01009210 kind = PyUnicode_KIND(unicode);
9211 data = PyUnicode_DATA(unicode);
9212
Victor Stinnerb84d7232011-11-22 01:50:07 +01009213 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009214 PyObject *exc;
9215 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009216 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009217 Py_ssize_t startpos;
9218
9219 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009220
Benjamin Peterson29060642009-01-31 22:14:21 +00009221 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009222 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009223 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009224 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009225 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009226 decimal = Py_UNICODE_TODECIMAL(ch);
9227 if (decimal >= 0) {
9228 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009229 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009230 continue;
9231 }
9232 if (0 < ch && ch < 256) {
9233 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009234 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009235 continue;
9236 }
Victor Stinner6345be92011-11-25 20:09:01 +01009237
Victor Stinner42bf7752011-11-21 22:52:58 +01009238 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009239 exc = NULL;
9240 raise_encode_exception(&exc, "decimal", unicode,
9241 startpos, startpos+1,
9242 "invalid decimal Unicode string");
9243 Py_XDECREF(exc);
9244 Py_DECREF(unicode);
9245 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009246 }
9247 /* 0-terminate the output string */
9248 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009249 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009250 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009251}
9252
Guido van Rossumd57fd912000-03-10 22:53:23 +00009253/* --- Helpers ------------------------------------------------------------ */
9254
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len`; a negative `end` or `start` is counted from
   the end (by adding `len`) and then clamped to 0.  `start` and `end`
   must be modifiable lvalues.  `len` is evaluated more than once, so it
   must not have side effects.

   The body is wrapped in do { ... } while (0) so that the macro behaves
   as a single statement (e.g. as the body of an unbraced `if` followed
   by `else`); it must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009270static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009271any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009272 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009273 Py_ssize_t end,
9274 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009276 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 void *buf1, *buf2;
9278 Py_ssize_t len1, len2, result;
9279
9280 kind1 = PyUnicode_KIND(s1);
9281 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009282 if (kind1 < kind2)
9283 return -1;
9284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 len1 = PyUnicode_GET_LENGTH(s1);
9286 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009287 ADJUST_INDICES(start, end, len1);
9288 if (end - start < len2)
9289 return -1;
9290
9291 buf1 = PyUnicode_DATA(s1);
9292 buf2 = PyUnicode_DATA(s2);
9293 if (len2 == 1) {
9294 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9295 result = findchar((const char *)buf1 + kind1*start,
9296 kind1, end - start, ch, direction);
9297 if (result == -1)
9298 return -1;
9299 else
9300 return start + result;
9301 }
9302
9303 if (kind2 != kind1) {
9304 buf2 = _PyUnicode_AsKind(s2, kind1);
9305 if (!buf2)
9306 return -2;
9307 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308
Victor Stinner794d5672011-10-10 03:21:36 +02009309 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009310 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009311 case PyUnicode_1BYTE_KIND:
9312 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9313 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9314 else
9315 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9316 break;
9317 case PyUnicode_2BYTE_KIND:
9318 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9319 break;
9320 case PyUnicode_4BYTE_KIND:
9321 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9322 break;
9323 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009324 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009325 }
9326 }
9327 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009328 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009329 case PyUnicode_1BYTE_KIND:
9330 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9331 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9332 else
9333 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9334 break;
9335 case PyUnicode_2BYTE_KIND:
9336 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9337 break;
9338 case PyUnicode_4BYTE_KIND:
9339 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9340 break;
9341 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009342 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009343 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344 }
9345
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009346 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 PyMem_Free(buf2);
9348
9349 return result;
9350}
9351
Victor Stinner59423e32018-11-26 13:40:01 +01009352/* _PyUnicode_InsertThousandsGrouping() helper functions */
9353#include "stringlib/localeutil.h"
9354
9355/**
9356 * InsertThousandsGrouping:
9357 * @writer: Unicode writer.
9358 * @n_buffer: Number of characters in @buffer.
9359 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9360 * @d_pos: Start of digits string.
9361 * @n_digits: The number of digits in the string, in which we want
9362 * to put the grouping chars.
9363 * @min_width: The minimum width of the digits in the output string.
9364 * Output will be zero-padded on the left to fill.
9365 * @grouping: see definition in localeconv().
9366 * @thousands_sep: see definition in localeconv().
9367 *
9368 * There are 2 modes: counting and filling. If @writer is NULL,
9369 * we are in counting mode, else filling mode.
9370 * If counting, the required buffer size is returned.
9371 * If filling, we know the buffer will be large enough, so we don't
9372 * need to pass in the buffer size.
9373 * Inserts thousand grouping characters (as defined by grouping and
9374 * thousands_sep) into @writer.
9375 *
9376 * Return value: -1 on error, number of characters otherwise.
9377 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009379_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009380 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009381 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009382 PyObject *digits,
9383 Py_ssize_t d_pos,
9384 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009385 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009386 const char *grouping,
9387 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009388 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389{
Xtreak3f7983a2019-01-07 20:39:14 +05309390 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009391 if (writer) {
9392 assert(digits != NULL);
9393 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009394 }
9395 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009396 assert(digits == NULL);
9397 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009398 }
Victor Stinner59423e32018-11-26 13:40:01 +01009399 assert(0 <= d_pos);
9400 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009401 assert(grouping != NULL);
9402
9403 if (digits != NULL) {
9404 if (PyUnicode_READY(digits) == -1) {
9405 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009406 }
Victor Stinner59423e32018-11-26 13:40:01 +01009407 }
9408 if (PyUnicode_READY(thousands_sep) == -1) {
9409 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009410 }
9411
Victor Stinner59423e32018-11-26 13:40:01 +01009412 Py_ssize_t count = 0;
9413 Py_ssize_t n_zeros;
9414 int loop_broken = 0;
9415 int use_separator = 0; /* First time through, don't append the
9416 separator. They only go between
9417 groups. */
9418 Py_ssize_t buffer_pos;
9419 Py_ssize_t digits_pos;
9420 Py_ssize_t len;
9421 Py_ssize_t n_chars;
9422 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9423 be looked at */
9424 /* A generator that returns all of the grouping widths, until it
9425 returns 0. */
9426 GroupGenerator groupgen;
9427 GroupGenerator_init(&groupgen, grouping);
9428 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9429
9430 /* if digits are not grouped, thousands separator
9431 should be an empty string */
9432 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9433
9434 digits_pos = d_pos + n_digits;
9435 if (writer) {
9436 buffer_pos = writer->pos + n_buffer;
9437 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9438 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 }
Victor Stinner59423e32018-11-26 13:40:01 +01009440 else {
9441 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009442 }
Victor Stinner59423e32018-11-26 13:40:01 +01009443
9444 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009445 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009446 }
Victor Stinner59423e32018-11-26 13:40:01 +01009447
9448 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9449 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9450 n_zeros = Py_MAX(0, len - remaining);
9451 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9452
9453 /* Use n_zero zero's and n_chars chars */
9454
9455 /* Count only, don't do anything. */
9456 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9457
9458 /* Copy into the writer. */
9459 InsertThousandsGrouping_fill(writer, &buffer_pos,
9460 digits, &digits_pos,
9461 n_chars, n_zeros,
9462 use_separator ? thousands_sep : NULL,
9463 thousands_sep_len, maxchar);
9464
9465 /* Use a separator next time. */
9466 use_separator = 1;
9467
9468 remaining -= n_chars;
9469 min_width -= len;
9470
9471 if (remaining <= 0 && min_width <= 0) {
9472 loop_broken = 1;
9473 break;
9474 }
9475 min_width -= thousands_sep_len;
9476 }
9477 if (!loop_broken) {
9478 /* We left the loop without using a break statement. */
9479
9480 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9481 n_zeros = Py_MAX(0, len - remaining);
9482 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9483
9484 /* Use n_zero zero's and n_chars chars */
9485 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9486
9487 /* Copy into the writer. */
9488 InsertThousandsGrouping_fill(writer, &buffer_pos,
9489 digits, &digits_pos,
9490 n_chars, n_zeros,
9491 use_separator ? thousands_sep : NULL,
9492 thousands_sep_len, maxchar);
9493 }
9494 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495}
9496
9497
Alexander Belopolsky40018472011-02-26 01:02:56 +00009498Py_ssize_t
9499PyUnicode_Count(PyObject *str,
9500 PyObject *substr,
9501 Py_ssize_t start,
9502 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009504 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009505 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 void *buf1 = NULL, *buf2 = NULL;
9507 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009508
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009509 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009511
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009512 kind1 = PyUnicode_KIND(str);
9513 kind2 = PyUnicode_KIND(substr);
9514 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009515 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009516
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009517 len1 = PyUnicode_GET_LENGTH(str);
9518 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009519 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009520 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009521 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009522
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009523 buf1 = PyUnicode_DATA(str);
9524 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009525 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009526 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009527 if (!buf2)
9528 goto onError;
9529 }
9530
9531 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009533 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009534 result = asciilib_count(
9535 ((Py_UCS1*)buf1) + start, end - start,
9536 buf2, len2, PY_SSIZE_T_MAX
9537 );
9538 else
9539 result = ucs1lib_count(
9540 ((Py_UCS1*)buf1) + start, end - start,
9541 buf2, len2, PY_SSIZE_T_MAX
9542 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 break;
9544 case PyUnicode_2BYTE_KIND:
9545 result = ucs2lib_count(
9546 ((Py_UCS2*)buf1) + start, end - start,
9547 buf2, len2, PY_SSIZE_T_MAX
9548 );
9549 break;
9550 case PyUnicode_4BYTE_KIND:
9551 result = ucs4lib_count(
9552 ((Py_UCS4*)buf1) + start, end - start,
9553 buf2, len2, PY_SSIZE_T_MAX
9554 );
9555 break;
9556 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009557 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009559
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009560 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 PyMem_Free(buf2);
9562
Guido van Rossumd57fd912000-03-10 22:53:23 +00009563 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009565 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 PyMem_Free(buf2);
9567 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009568}
9569
Alexander Belopolsky40018472011-02-26 01:02:56 +00009570Py_ssize_t
9571PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009572 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009573 Py_ssize_t start,
9574 Py_ssize_t end,
9575 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009576{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009577 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009578 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009579
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009580 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581}
9582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583Py_ssize_t
9584PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9585 Py_ssize_t start, Py_ssize_t end,
9586 int direction)
9587{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009589 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 if (PyUnicode_READY(str) == -1)
9591 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009592 len = PyUnicode_GET_LENGTH(str);
9593 ADJUST_INDICES(start, end, len);
9594 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009595 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009597 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9598 kind, end-start, ch, direction);
9599 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009600 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009601 else
9602 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603}
9604
Alexander Belopolsky40018472011-02-26 01:02:56 +00009605static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009606tailmatch(PyObject *self,
9607 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009608 Py_ssize_t start,
9609 Py_ssize_t end,
9610 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 int kind_self;
9613 int kind_sub;
9614 void *data_self;
9615 void *data_sub;
9616 Py_ssize_t offset;
9617 Py_ssize_t i;
9618 Py_ssize_t end_sub;
9619
9620 if (PyUnicode_READY(self) == -1 ||
9621 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009622 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9625 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009627 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009629 if (PyUnicode_GET_LENGTH(substring) == 0)
9630 return 1;
9631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 kind_self = PyUnicode_KIND(self);
9633 data_self = PyUnicode_DATA(self);
9634 kind_sub = PyUnicode_KIND(substring);
9635 data_sub = PyUnicode_DATA(substring);
9636 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9637
9638 if (direction > 0)
9639 offset = end;
9640 else
9641 offset = start;
9642
9643 if (PyUnicode_READ(kind_self, data_self, offset) ==
9644 PyUnicode_READ(kind_sub, data_sub, 0) &&
9645 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9646 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9647 /* If both are of the same kind, memcmp is sufficient */
9648 if (kind_self == kind_sub) {
9649 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009650 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009651 data_sub,
9652 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009653 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009655 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656 else {
9657 /* We do not need to compare 0 and len(substring)-1 because
9658 the if statement above ensured already that they are equal
9659 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009660 for (i = 1; i < end_sub; ++i) {
9661 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9662 PyUnicode_READ(kind_sub, data_sub, i))
9663 return 0;
9664 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009665 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009666 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667 }
9668
9669 return 0;
9670}
9671
Alexander Belopolsky40018472011-02-26 01:02:56 +00009672Py_ssize_t
9673PyUnicode_Tailmatch(PyObject *str,
9674 PyObject *substr,
9675 Py_ssize_t start,
9676 Py_ssize_t end,
9677 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009678{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009679 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009680 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009681
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009682 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683}
9684
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009685static PyObject *
9686ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009688 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9689 char *resdata, *data = PyUnicode_DATA(self);
9690 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009691
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009692 res = PyUnicode_New(len, 127);
9693 if (res == NULL)
9694 return NULL;
9695 resdata = PyUnicode_DATA(res);
9696 if (lower)
9697 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009698 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009699 _Py_bytes_upper(resdata, data, len);
9700 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009701}
9702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009704handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009706 Py_ssize_t j;
9707 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009708 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009709 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009710
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9712
9713 where ! is a negation and \p{xxx} is a character with property xxx.
9714 */
9715 for (j = i - 1; j >= 0; j--) {
9716 c = PyUnicode_READ(kind, data, j);
9717 if (!_PyUnicode_IsCaseIgnorable(c))
9718 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009720 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9721 if (final_sigma) {
9722 for (j = i + 1; j < length; j++) {
9723 c = PyUnicode_READ(kind, data, j);
9724 if (!_PyUnicode_IsCaseIgnorable(c))
9725 break;
9726 }
9727 final_sigma = j == length || !_PyUnicode_IsCased(c);
9728 }
9729 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730}
9731
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009732static int
9733lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9734 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009736 /* Obscure special case. */
9737 if (c == 0x3A3) {
9738 mapped[0] = handle_capital_sigma(kind, data, length, i);
9739 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009741 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009742}
9743
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009744static Py_ssize_t
9745do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009746{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009747 Py_ssize_t i, k = 0;
9748 int n_res, j;
9749 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009750
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009751 c = PyUnicode_READ(kind, data, 0);
9752 n_res = _PyUnicode_ToUpperFull(c, mapped);
9753 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009754 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009755 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009757 for (i = 1; i < length; i++) {
9758 c = PyUnicode_READ(kind, data, i);
9759 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9760 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009761 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009762 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009763 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009764 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009765 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009766}
9767
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009768static Py_ssize_t
9769do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9770 Py_ssize_t i, k = 0;
9771
9772 for (i = 0; i < length; i++) {
9773 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9774 int n_res, j;
9775 if (Py_UNICODE_ISUPPER(c)) {
9776 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9777 }
9778 else if (Py_UNICODE_ISLOWER(c)) {
9779 n_res = _PyUnicode_ToUpperFull(c, mapped);
9780 }
9781 else {
9782 n_res = 1;
9783 mapped[0] = c;
9784 }
9785 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009786 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009787 res[k++] = mapped[j];
9788 }
9789 }
9790 return k;
9791}
9792
9793static Py_ssize_t
9794do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9795 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009796{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009797 Py_ssize_t i, k = 0;
9798
9799 for (i = 0; i < length; i++) {
9800 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9801 int n_res, j;
9802 if (lower)
9803 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9804 else
9805 n_res = _PyUnicode_ToUpperFull(c, mapped);
9806 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009807 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009808 res[k++] = mapped[j];
9809 }
9810 }
9811 return k;
9812}
9813
9814static Py_ssize_t
9815do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9816{
9817 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9818}
9819
9820static Py_ssize_t
9821do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9822{
9823 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9824}
9825
Benjamin Petersone51757f2012-01-12 21:10:29 -05009826static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009827do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9828{
9829 Py_ssize_t i, k = 0;
9830
9831 for (i = 0; i < length; i++) {
9832 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9833 Py_UCS4 mapped[3];
9834 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9835 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009836 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009837 res[k++] = mapped[j];
9838 }
9839 }
9840 return k;
9841}
9842
9843static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009844do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9845{
9846 Py_ssize_t i, k = 0;
9847 int previous_is_cased;
9848
9849 previous_is_cased = 0;
9850 for (i = 0; i < length; i++) {
9851 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9852 Py_UCS4 mapped[3];
9853 int n_res, j;
9854
9855 if (previous_is_cased)
9856 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9857 else
9858 n_res = _PyUnicode_ToTitleFull(c, mapped);
9859
9860 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009861 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009862 res[k++] = mapped[j];
9863 }
9864
9865 previous_is_cased = _PyUnicode_IsCased(c);
9866 }
9867 return k;
9868}
9869
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009870static PyObject *
9871case_operation(PyObject *self,
9872 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9873{
9874 PyObject *res = NULL;
9875 Py_ssize_t length, newlength = 0;
9876 int kind, outkind;
9877 void *data, *outdata;
9878 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9879
Benjamin Petersoneea48462012-01-16 14:28:50 -05009880 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009881
9882 kind = PyUnicode_KIND(self);
9883 data = PyUnicode_DATA(self);
9884 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009885 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009886 PyErr_SetString(PyExc_OverflowError, "string is too long");
9887 return NULL;
9888 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009889 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009890 if (tmp == NULL)
9891 return PyErr_NoMemory();
9892 newlength = perform(kind, data, length, tmp, &maxchar);
9893 res = PyUnicode_New(newlength, maxchar);
9894 if (res == NULL)
9895 goto leave;
9896 tmpend = tmp + newlength;
9897 outdata = PyUnicode_DATA(res);
9898 outkind = PyUnicode_KIND(res);
9899 switch (outkind) {
9900 case PyUnicode_1BYTE_KIND:
9901 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9902 break;
9903 case PyUnicode_2BYTE_KIND:
9904 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9905 break;
9906 case PyUnicode_4BYTE_KIND:
9907 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9908 break;
9909 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009910 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009911 }
9912 leave:
9913 PyMem_FREE(tmp);
9914 return res;
9915}
9916
Tim Peters8ce9f162004-08-27 01:49:32 +00009917PyObject *
9918PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009919{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009920 PyObject *res;
9921 PyObject *fseq;
9922 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009923 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009925 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009926 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009927 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009928 }
9929
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009930 /* NOTE: the following code can't call back into Python code,
9931 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009932 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009933
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009934 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009935 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009936 res = _PyUnicode_JoinArray(separator, items, seqlen);
9937 Py_DECREF(fseq);
9938 return res;
9939}
9940
9941PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009942_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009943{
9944 PyObject *res = NULL; /* the result */
9945 PyObject *sep = NULL;
9946 Py_ssize_t seplen;
9947 PyObject *item;
9948 Py_ssize_t sz, i, res_offset;
9949 Py_UCS4 maxchar;
9950 Py_UCS4 item_maxchar;
9951 int use_memcpy;
9952 unsigned char *res_data = NULL, *sep_data = NULL;
9953 PyObject *last_obj;
9954 unsigned int kind = 0;
9955
Tim Peters05eba1f2004-08-27 21:32:02 +00009956 /* If empty sequence, return u"". */
9957 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009958 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009959 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009960
Tim Peters05eba1f2004-08-27 21:32:02 +00009961 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009962 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009963 if (seqlen == 1) {
9964 if (PyUnicode_CheckExact(items[0])) {
9965 res = items[0];
9966 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009967 return res;
9968 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009969 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009970 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009971 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009972 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009973 /* Set up sep and seplen */
9974 if (separator == NULL) {
9975 /* fall back to a blank space separator */
9976 sep = PyUnicode_FromOrdinal(' ');
9977 if (!sep)
9978 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009979 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009980 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009981 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009982 else {
9983 if (!PyUnicode_Check(separator)) {
9984 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009985 "separator: expected str instance,"
9986 " %.80s found",
9987 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009988 goto onError;
9989 }
9990 if (PyUnicode_READY(separator))
9991 goto onError;
9992 sep = separator;
9993 seplen = PyUnicode_GET_LENGTH(separator);
9994 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9995 /* inc refcount to keep this code path symmetric with the
9996 above case of a blank separator */
9997 Py_INCREF(sep);
9998 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009999 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010000 }
10001
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010002 /* There are at least two things to join, or else we have a subclass
10003 * of str in the sequence.
10004 * Do a pre-pass to figure out the total amount of space we'll
10005 * need (sz), and see whether all argument are strings.
10006 */
10007 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010008#ifdef Py_DEBUG
10009 use_memcpy = 0;
10010#else
10011 use_memcpy = 1;
10012#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010013 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010014 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010015 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010016 if (!PyUnicode_Check(item)) {
10017 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010018 "sequence item %zd: expected str instance,"
10019 " %.80s found",
10020 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010021 goto onError;
10022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 if (PyUnicode_READY(item) == -1)
10024 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010025 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010027 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010028 if (i != 0) {
10029 add_sz += seplen;
10030 }
10031 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010032 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010033 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010034 goto onError;
10035 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010036 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010037 if (use_memcpy && last_obj != NULL) {
10038 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10039 use_memcpy = 0;
10040 }
10041 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010042 }
Tim Petersced69f82003-09-16 20:30:58 +000010043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010045 if (res == NULL)
10046 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010047
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010048 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010049#ifdef Py_DEBUG
10050 use_memcpy = 0;
10051#else
10052 if (use_memcpy) {
10053 res_data = PyUnicode_1BYTE_DATA(res);
10054 kind = PyUnicode_KIND(res);
10055 if (seplen != 0)
10056 sep_data = PyUnicode_1BYTE_DATA(sep);
10057 }
10058#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010059 if (use_memcpy) {
10060 for (i = 0; i < seqlen; ++i) {
10061 Py_ssize_t itemlen;
10062 item = items[i];
10063
10064 /* Copy item, and maybe the separator. */
10065 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010066 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010067 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010068 kind * seplen);
10069 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010070 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010071
10072 itemlen = PyUnicode_GET_LENGTH(item);
10073 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010074 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010075 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010076 kind * itemlen);
10077 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010078 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010079 }
10080 assert(res_data == PyUnicode_1BYTE_DATA(res)
10081 + kind * PyUnicode_GET_LENGTH(res));
10082 }
10083 else {
10084 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10085 Py_ssize_t itemlen;
10086 item = items[i];
10087
10088 /* Copy item, and maybe the separator. */
10089 if (i && seplen != 0) {
10090 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10091 res_offset += seplen;
10092 }
10093
10094 itemlen = PyUnicode_GET_LENGTH(item);
10095 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010096 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010097 res_offset += itemlen;
10098 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010099 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010100 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010101 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010104 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010106
Benjamin Peterson29060642009-01-31 22:14:21 +000010107 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010109 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110 return NULL;
10111}
10112
Victor Stinnerd3f08822012-05-29 12:57:52 +020010113void
10114_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10115 Py_UCS4 fill_char)
10116{
10117 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010118 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010119 assert(PyUnicode_IS_READY(unicode));
10120 assert(unicode_modifiable(unicode));
10121 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10122 assert(start >= 0);
10123 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010124 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010125}
10126
Victor Stinner3fe55312012-01-04 00:33:50 +010010127Py_ssize_t
10128PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10129 Py_UCS4 fill_char)
10130{
10131 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010132
10133 if (!PyUnicode_Check(unicode)) {
10134 PyErr_BadInternalCall();
10135 return -1;
10136 }
10137 if (PyUnicode_READY(unicode) == -1)
10138 return -1;
10139 if (unicode_check_modifiable(unicode))
10140 return -1;
10141
Victor Stinnerd3f08822012-05-29 12:57:52 +020010142 if (start < 0) {
10143 PyErr_SetString(PyExc_IndexError, "string index out of range");
10144 return -1;
10145 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010146 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10147 PyErr_SetString(PyExc_ValueError,
10148 "fill character is bigger than "
10149 "the string maximum character");
10150 return -1;
10151 }
10152
10153 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10154 length = Py_MIN(maxlen, length);
10155 if (length <= 0)
10156 return 0;
10157
Victor Stinnerd3f08822012-05-29 12:57:52 +020010158 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010159 return length;
10160}
10161
Victor Stinner9310abb2011-10-05 00:59:23 +020010162static PyObject *
10163pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010164 Py_ssize_t left,
10165 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 PyObject *u;
10169 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010170 int kind;
10171 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172
10173 if (left < 0)
10174 left = 0;
10175 if (right < 0)
10176 right = 0;
10177
Victor Stinnerc4b49542011-12-11 22:44:26 +010010178 if (left == 0 && right == 0)
10179 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10182 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010183 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10184 return NULL;
10185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010187 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010189 if (!u)
10190 return NULL;
10191
10192 kind = PyUnicode_KIND(u);
10193 data = PyUnicode_DATA(u);
10194 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010195 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010196 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010197 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010198 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010199 assert(_PyUnicode_CheckConsistency(u, 1));
10200 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201}
10202
Alexander Belopolsky40018472011-02-26 01:02:56 +000010203PyObject *
10204PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010208 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210
Benjamin Petersonead6b532011-12-20 17:23:42 -060010211 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010213 if (PyUnicode_IS_ASCII(string))
10214 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010215 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010216 PyUnicode_GET_LENGTH(string), keepends);
10217 else
10218 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010219 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010220 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 break;
10222 case PyUnicode_2BYTE_KIND:
10223 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010224 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 PyUnicode_GET_LENGTH(string), keepends);
10226 break;
10227 case PyUnicode_4BYTE_KIND:
10228 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010229 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 PyUnicode_GET_LENGTH(string), keepends);
10231 break;
10232 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010233 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236}
10237
Alexander Belopolsky40018472011-02-26 01:02:56 +000010238static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010239split(PyObject *self,
10240 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010241 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010243 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 void *buf1, *buf2;
10245 Py_ssize_t len1, len2;
10246 PyObject* out;
10247
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010249 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 if (PyUnicode_READY(self) == -1)
10252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010255 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010257 if (PyUnicode_IS_ASCII(self))
10258 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010259 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010260 PyUnicode_GET_LENGTH(self), maxcount
10261 );
10262 else
10263 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010264 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010265 PyUnicode_GET_LENGTH(self), maxcount
10266 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 case PyUnicode_2BYTE_KIND:
10268 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010269 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 PyUnicode_GET_LENGTH(self), maxcount
10271 );
10272 case PyUnicode_4BYTE_KIND:
10273 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010274 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 PyUnicode_GET_LENGTH(self), maxcount
10276 );
10277 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010278 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 }
10280
10281 if (PyUnicode_READY(substring) == -1)
10282 return NULL;
10283
10284 kind1 = PyUnicode_KIND(self);
10285 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 len1 = PyUnicode_GET_LENGTH(self);
10287 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010288 if (kind1 < kind2 || len1 < len2) {
10289 out = PyList_New(1);
10290 if (out == NULL)
10291 return NULL;
10292 Py_INCREF(self);
10293 PyList_SET_ITEM(out, 0, self);
10294 return out;
10295 }
10296 buf1 = PyUnicode_DATA(self);
10297 buf2 = PyUnicode_DATA(substring);
10298 if (kind2 != kind1) {
10299 buf2 = _PyUnicode_AsKind(substring, kind1);
10300 if (!buf2)
10301 return NULL;
10302 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010304 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010306 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10307 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010308 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010309 else
10310 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010311 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 break;
10313 case PyUnicode_2BYTE_KIND:
10314 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010315 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 break;
10317 case PyUnicode_4BYTE_KIND:
10318 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010319 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 break;
10321 default:
10322 out = NULL;
10323 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010324 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 PyMem_Free(buf2);
10326 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327}
10328
Alexander Belopolsky40018472011-02-26 01:02:56 +000010329static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010330rsplit(PyObject *self,
10331 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010332 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010333{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010334 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 void *buf1, *buf2;
10336 Py_ssize_t len1, len2;
10337 PyObject* out;
10338
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010339 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010340 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 if (PyUnicode_READY(self) == -1)
10343 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010346 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010348 if (PyUnicode_IS_ASCII(self))
10349 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010350 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010351 PyUnicode_GET_LENGTH(self), maxcount
10352 );
10353 else
10354 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010355 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010356 PyUnicode_GET_LENGTH(self), maxcount
10357 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 case PyUnicode_2BYTE_KIND:
10359 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010360 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 PyUnicode_GET_LENGTH(self), maxcount
10362 );
10363 case PyUnicode_4BYTE_KIND:
10364 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010365 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 PyUnicode_GET_LENGTH(self), maxcount
10367 );
10368 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010369 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 }
10371
10372 if (PyUnicode_READY(substring) == -1)
10373 return NULL;
10374
10375 kind1 = PyUnicode_KIND(self);
10376 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 len1 = PyUnicode_GET_LENGTH(self);
10378 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010379 if (kind1 < kind2 || len1 < len2) {
10380 out = PyList_New(1);
10381 if (out == NULL)
10382 return NULL;
10383 Py_INCREF(self);
10384 PyList_SET_ITEM(out, 0, self);
10385 return out;
10386 }
10387 buf1 = PyUnicode_DATA(self);
10388 buf2 = PyUnicode_DATA(substring);
10389 if (kind2 != kind1) {
10390 buf2 = _PyUnicode_AsKind(substring, kind1);
10391 if (!buf2)
10392 return NULL;
10393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010395 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010397 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10398 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010399 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010400 else
10401 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010402 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 break;
10404 case PyUnicode_2BYTE_KIND:
10405 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010406 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 break;
10408 case PyUnicode_4BYTE_KIND:
10409 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010410 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411 break;
10412 default:
10413 out = NULL;
10414 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010415 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 PyMem_Free(buf2);
10417 return out;
10418}
10419
10420static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010421anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10422 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010424 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010426 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10427 return asciilib_find(buf1, len1, buf2, len2, offset);
10428 else
10429 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 case PyUnicode_2BYTE_KIND:
10431 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10432 case PyUnicode_4BYTE_KIND:
10433 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10434 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010435 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436}
10437
10438static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010439anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10440 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010442 switch (kind) {
10443 case PyUnicode_1BYTE_KIND:
10444 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10445 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10446 else
10447 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10448 case PyUnicode_2BYTE_KIND:
10449 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10450 case PyUnicode_4BYTE_KIND:
10451 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10452 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010453 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010454}
10455
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010456static void
10457replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10458 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10459{
10460 int kind = PyUnicode_KIND(u);
10461 void *data = PyUnicode_DATA(u);
10462 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10463 if (kind == PyUnicode_1BYTE_KIND) {
10464 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10465 (Py_UCS1 *)data + len,
10466 u1, u2, maxcount);
10467 }
10468 else if (kind == PyUnicode_2BYTE_KIND) {
10469 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10470 (Py_UCS2 *)data + len,
10471 u1, u2, maxcount);
10472 }
10473 else {
10474 assert(kind == PyUnicode_4BYTE_KIND);
10475 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10476 (Py_UCS4 *)data + len,
10477 u1, u2, maxcount);
10478 }
10479}
10480
Alexander Belopolsky40018472011-02-26 01:02:56 +000010481static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482replace(PyObject *self, PyObject *str1,
10483 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 PyObject *u;
10486 char *sbuf = PyUnicode_DATA(self);
10487 char *buf1 = PyUnicode_DATA(str1);
10488 char *buf2 = PyUnicode_DATA(str2);
10489 int srelease = 0, release1 = 0, release2 = 0;
10490 int skind = PyUnicode_KIND(self);
10491 int kind1 = PyUnicode_KIND(str1);
10492 int kind2 = PyUnicode_KIND(str2);
10493 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10494 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10495 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010496 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010497 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498
10499 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010500 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010502 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503
Victor Stinner59de0ee2011-10-07 10:01:28 +020010504 if (str1 == str2)
10505 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506
Victor Stinner49a0a212011-10-12 23:46:10 +020010507 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010508 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10509 if (maxchar < maxchar_str1)
10510 /* substring too wide to be present */
10511 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010512 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10513 /* Replacing str1 with str2 may cause a maxchar reduction in the
10514 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010515 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010516 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010519 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010521 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010523 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010524 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010525 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010526
Victor Stinner69ed0f42013-04-09 21:48:24 +020010527 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010528 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010529 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010530 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010531 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010533 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010535
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010536 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10537 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010538 }
10539 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 int rkind = skind;
10541 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010542 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 if (kind1 < rkind) {
10545 /* widen substring */
10546 buf1 = _PyUnicode_AsKind(str1, rkind);
10547 if (!buf1) goto error;
10548 release1 = 1;
10549 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010550 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010551 if (i < 0)
10552 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 if (rkind > kind2) {
10554 /* widen replacement */
10555 buf2 = _PyUnicode_AsKind(str2, rkind);
10556 if (!buf2) goto error;
10557 release2 = 1;
10558 }
10559 else if (rkind < kind2) {
10560 /* widen self and buf1 */
10561 rkind = kind2;
10562 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010563 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 sbuf = _PyUnicode_AsKind(self, rkind);
10565 if (!sbuf) goto error;
10566 srelease = 1;
10567 buf1 = _PyUnicode_AsKind(str1, rkind);
10568 if (!buf1) goto error;
10569 release1 = 1;
10570 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010571 u = PyUnicode_New(slen, maxchar);
10572 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010574 assert(PyUnicode_KIND(u) == rkind);
10575 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010576
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010577 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010578 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010579 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010581 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010583
10584 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010585 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010586 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010587 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010588 if (i == -1)
10589 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010590 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010592 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010596 }
10597 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010599 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 int rkind = skind;
10601 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010604 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 buf1 = _PyUnicode_AsKind(str1, rkind);
10606 if (!buf1) goto error;
10607 release1 = 1;
10608 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010609 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010610 if (n == 0)
10611 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010613 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 buf2 = _PyUnicode_AsKind(str2, rkind);
10615 if (!buf2) goto error;
10616 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010619 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 rkind = kind2;
10621 sbuf = _PyUnicode_AsKind(self, rkind);
10622 if (!sbuf) goto error;
10623 srelease = 1;
10624 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010625 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 buf1 = _PyUnicode_AsKind(str1, rkind);
10627 if (!buf1) goto error;
10628 release1 = 1;
10629 }
10630 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10631 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010632 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 PyErr_SetString(PyExc_OverflowError,
10634 "replace string is too long");
10635 goto error;
10636 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010637 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010638 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010639 _Py_INCREF_UNICODE_EMPTY();
10640 if (!unicode_empty)
10641 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010642 u = unicode_empty;
10643 goto done;
10644 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010645 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 PyErr_SetString(PyExc_OverflowError,
10647 "replace string is too long");
10648 goto error;
10649 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010650 u = PyUnicode_New(new_size, maxchar);
10651 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010653 assert(PyUnicode_KIND(u) == rkind);
10654 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 ires = i = 0;
10656 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010657 while (n-- > 0) {
10658 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010659 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010660 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010661 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010662 if (j == -1)
10663 break;
10664 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010665 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010666 memcpy(res + rkind * ires,
10667 sbuf + rkind * i,
10668 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010670 }
10671 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010673 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010675 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010681 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010682 memcpy(res + rkind * ires,
10683 sbuf + rkind * i,
10684 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010685 }
10686 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010687 /* interleave */
10688 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010689 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010691 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010693 if (--n <= 0)
10694 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010695 memcpy(res + rkind * ires,
10696 sbuf + rkind * i,
10697 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 ires++;
10699 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010700 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010701 memcpy(res + rkind * ires,
10702 sbuf + rkind * i,
10703 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010704 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010705 }
10706
10707 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010708 unicode_adjust_maxchar(&u);
10709 if (u == NULL)
10710 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010712
10713 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 if (srelease)
10715 PyMem_FREE(sbuf);
10716 if (release1)
10717 PyMem_FREE(buf1);
10718 if (release2)
10719 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010720 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010722
Benjamin Peterson29060642009-01-31 22:14:21 +000010723 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010724 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725 if (srelease)
10726 PyMem_FREE(sbuf);
10727 if (release1)
10728 PyMem_FREE(buf1);
10729 if (release2)
10730 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010731 return unicode_result_unchanged(self);
10732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 error:
10734 if (srelease && sbuf)
10735 PyMem_FREE(sbuf);
10736 if (release1 && buf1)
10737 PyMem_FREE(buf1);
10738 if (release2 && buf2)
10739 PyMem_FREE(buf2);
10740 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741}
10742
10743/* --- Unicode Object Methods --------------------------------------------- */
10744
INADA Naoki3ae20562017-01-16 20:41:20 +090010745/*[clinic input]
10746str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747
INADA Naoki3ae20562017-01-16 20:41:20 +090010748Return a version of the string where each word is titlecased.
10749
10750More specifically, words start with uppercased characters and all remaining
10751cased characters have lower case.
10752[clinic start generated code]*/
10753
10754static PyObject *
10755unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010756/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010758 if (PyUnicode_READY(self) == -1)
10759 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010760 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761}
10762
INADA Naoki3ae20562017-01-16 20:41:20 +090010763/*[clinic input]
10764str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765
INADA Naoki3ae20562017-01-16 20:41:20 +090010766Return a capitalized version of the string.
10767
10768More specifically, make the first character have upper case and the rest lower
10769case.
10770[clinic start generated code]*/
10771
10772static PyObject *
10773unicode_capitalize_impl(PyObject *self)
10774/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010776 if (PyUnicode_READY(self) == -1)
10777 return NULL;
10778 if (PyUnicode_GET_LENGTH(self) == 0)
10779 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010780 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781}
10782
INADA Naoki3ae20562017-01-16 20:41:20 +090010783/*[clinic input]
10784str.casefold as unicode_casefold
10785
10786Return a version of the string suitable for caseless comparisons.
10787[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010788
10789static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010790unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010791/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010792{
10793 if (PyUnicode_READY(self) == -1)
10794 return NULL;
10795 if (PyUnicode_IS_ASCII(self))
10796 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010797 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010798}
10799
10800
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010801/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010802
10803static int
10804convert_uc(PyObject *obj, void *addr)
10805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010807
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010808 if (!PyUnicode_Check(obj)) {
10809 PyErr_Format(PyExc_TypeError,
10810 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010811 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010812 return 0;
10813 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010814 if (PyUnicode_READY(obj) < 0)
10815 return 0;
10816 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010817 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010818 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010819 return 0;
10820 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010821 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010822 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010823}
10824
INADA Naoki3ae20562017-01-16 20:41:20 +090010825/*[clinic input]
10826str.center as unicode_center
10827
10828 width: Py_ssize_t
10829 fillchar: Py_UCS4 = ' '
10830 /
10831
10832Return a centered string of length width.
10833
10834Padding is done using the specified fill character (default is a space).
10835[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836
10837static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010838unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10839/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010841 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842
Benjamin Petersonbac79492012-01-14 13:34:47 -050010843 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844 return NULL;
10845
Victor Stinnerc4b49542011-12-11 22:44:26 +010010846 if (PyUnicode_GET_LENGTH(self) >= width)
10847 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848
Victor Stinnerc4b49542011-12-11 22:44:26 +010010849 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850 left = marg / 2 + (marg & width & 1);
10851
Victor Stinner9310abb2011-10-05 00:59:23 +020010852 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853}
10854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855/* This function assumes that str1 and str2 are readied by the caller. */
10856
Marc-André Lemburge5034372000-08-08 08:04:29 +000010857static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010858unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010859{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010860#define COMPARE(TYPE1, TYPE2) \
10861 do { \
10862 TYPE1* p1 = (TYPE1 *)data1; \
10863 TYPE2* p2 = (TYPE2 *)data2; \
10864 TYPE1* end = p1 + len; \
10865 Py_UCS4 c1, c2; \
10866 for (; p1 != end; p1++, p2++) { \
10867 c1 = *p1; \
10868 c2 = *p2; \
10869 if (c1 != c2) \
10870 return (c1 < c2) ? -1 : 1; \
10871 } \
10872 } \
10873 while (0)
10874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010875 int kind1, kind2;
10876 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010877 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010879 kind1 = PyUnicode_KIND(str1);
10880 kind2 = PyUnicode_KIND(str2);
10881 data1 = PyUnicode_DATA(str1);
10882 data2 = PyUnicode_DATA(str2);
10883 len1 = PyUnicode_GET_LENGTH(str1);
10884 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010885 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010886
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010887 switch(kind1) {
10888 case PyUnicode_1BYTE_KIND:
10889 {
10890 switch(kind2) {
10891 case PyUnicode_1BYTE_KIND:
10892 {
10893 int cmp = memcmp(data1, data2, len);
10894 /* normalize result of memcmp() into the range [-1; 1] */
10895 if (cmp < 0)
10896 return -1;
10897 if (cmp > 0)
10898 return 1;
10899 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010900 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010901 case PyUnicode_2BYTE_KIND:
10902 COMPARE(Py_UCS1, Py_UCS2);
10903 break;
10904 case PyUnicode_4BYTE_KIND:
10905 COMPARE(Py_UCS1, Py_UCS4);
10906 break;
10907 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010908 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010909 }
10910 break;
10911 }
10912 case PyUnicode_2BYTE_KIND:
10913 {
10914 switch(kind2) {
10915 case PyUnicode_1BYTE_KIND:
10916 COMPARE(Py_UCS2, Py_UCS1);
10917 break;
10918 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010919 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010920 COMPARE(Py_UCS2, Py_UCS2);
10921 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010922 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010923 case PyUnicode_4BYTE_KIND:
10924 COMPARE(Py_UCS2, Py_UCS4);
10925 break;
10926 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010927 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010928 }
10929 break;
10930 }
10931 case PyUnicode_4BYTE_KIND:
10932 {
10933 switch(kind2) {
10934 case PyUnicode_1BYTE_KIND:
10935 COMPARE(Py_UCS4, Py_UCS1);
10936 break;
10937 case PyUnicode_2BYTE_KIND:
10938 COMPARE(Py_UCS4, Py_UCS2);
10939 break;
10940 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010941 {
10942#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10943 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10944 /* normalize result of wmemcmp() into the range [-1; 1] */
10945 if (cmp < 0)
10946 return -1;
10947 if (cmp > 0)
10948 return 1;
10949#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010950 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010951#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010952 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010953 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010954 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010955 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010956 }
10957 break;
10958 }
10959 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010960 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010961 }
10962
Victor Stinner770e19e2012-10-04 22:59:45 +020010963 if (len1 == len2)
10964 return 0;
10965 if (len1 < len2)
10966 return -1;
10967 else
10968 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010969
10970#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010971}
10972
Benjamin Peterson621b4302016-09-09 13:54:34 -070010973static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010974unicode_compare_eq(PyObject *str1, PyObject *str2)
10975{
10976 int kind;
10977 void *data1, *data2;
10978 Py_ssize_t len;
10979 int cmp;
10980
Victor Stinnere5567ad2012-10-23 02:48:49 +020010981 len = PyUnicode_GET_LENGTH(str1);
10982 if (PyUnicode_GET_LENGTH(str2) != len)
10983 return 0;
10984 kind = PyUnicode_KIND(str1);
10985 if (PyUnicode_KIND(str2) != kind)
10986 return 0;
10987 data1 = PyUnicode_DATA(str1);
10988 data2 = PyUnicode_DATA(str2);
10989
10990 cmp = memcmp(data1, data2, len * kind);
10991 return (cmp == 0);
10992}
10993
10994
Alexander Belopolsky40018472011-02-26 01:02:56 +000010995int
10996PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10999 if (PyUnicode_READY(left) == -1 ||
11000 PyUnicode_READY(right) == -1)
11001 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011002
11003 /* a string is equal to itself */
11004 if (left == right)
11005 return 0;
11006
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011007 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011009 PyErr_Format(PyExc_TypeError,
11010 "Can't compare %.100s and %.100s",
11011 left->ob_type->tp_name,
11012 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013 return -1;
11014}
11015
Martin v. Löwis5b222132007-06-10 09:51:05 +000011016int
11017PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011019 Py_ssize_t i;
11020 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011021 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011022 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023
Victor Stinner910337b2011-10-03 03:20:16 +020011024 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011025 if (!PyUnicode_IS_READY(uni)) {
11026 const wchar_t *ws = _PyUnicode_WSTR(uni);
11027 /* Compare Unicode string and source character set string */
11028 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11029 if (chr != ustr[i])
11030 return (chr < ustr[i]) ? -1 : 1;
11031 }
11032 /* This check keeps Python strings that end in '\0' from comparing equal
11033 to C strings identical up to that point. */
11034 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11035 return 1; /* uni is longer */
11036 if (ustr[i])
11037 return -1; /* str is longer */
11038 return 0;
11039 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011041 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011042 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011043 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011044 size_t len, len2 = strlen(str);
11045 int cmp;
11046
11047 len = Py_MIN(len1, len2);
11048 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011049 if (cmp != 0) {
11050 if (cmp < 0)
11051 return -1;
11052 else
11053 return 1;
11054 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011055 if (len1 > len2)
11056 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011057 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011058 return -1; /* str is longer */
11059 return 0;
11060 }
11061 else {
11062 void *data = PyUnicode_DATA(uni);
11063 /* Compare Unicode string and source character set string */
11064 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011065 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011066 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11067 /* This check keeps Python strings that end in '\0' from comparing equal
11068 to C strings identical up to that point. */
11069 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11070 return 1; /* uni is longer */
11071 if (str[i])
11072 return -1; /* str is longer */
11073 return 0;
11074 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011075}
11076
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011077static int
11078non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11079{
11080 size_t i, len;
11081 const wchar_t *p;
11082 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11083 if (strlen(str) != len)
11084 return 0;
11085 p = _PyUnicode_WSTR(unicode);
11086 assert(p);
11087 for (i = 0; i < len; i++) {
11088 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011089 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011090 return 0;
11091 }
11092 return 1;
11093}
11094
11095int
11096_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11097{
11098 size_t len;
11099 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011100 assert(str);
11101#ifndef NDEBUG
11102 for (const char *p = str; *p; p++) {
11103 assert((unsigned char)*p < 128);
11104 }
11105#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011106 if (PyUnicode_READY(unicode) == -1) {
11107 /* Memory error or bad data */
11108 PyErr_Clear();
11109 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11110 }
11111 if (!PyUnicode_IS_ASCII(unicode))
11112 return 0;
11113 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11114 return strlen(str) == len &&
11115 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11116}
11117
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011118int
11119_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11120{
11121 PyObject *right_uni;
11122 Py_hash_t hash;
11123
11124 assert(_PyUnicode_CHECK(left));
11125 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011126#ifndef NDEBUG
11127 for (const char *p = right->string; *p; p++) {
11128 assert((unsigned char)*p < 128);
11129 }
11130#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011131
11132 if (PyUnicode_READY(left) == -1) {
11133 /* memory error or bad data */
11134 PyErr_Clear();
11135 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11136 }
11137
11138 if (!PyUnicode_IS_ASCII(left))
11139 return 0;
11140
11141 right_uni = _PyUnicode_FromId(right); /* borrowed */
11142 if (right_uni == NULL) {
11143 /* memory error or bad data */
11144 PyErr_Clear();
11145 return _PyUnicode_EqualToASCIIString(left, right->string);
11146 }
11147
11148 if (left == right_uni)
11149 return 1;
11150
11151 if (PyUnicode_CHECK_INTERNED(left))
11152 return 0;
11153
INADA Naoki7cc95f52018-01-28 02:07:09 +090011154 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011155 hash = _PyUnicode_HASH(left);
11156 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11157 return 0;
11158
11159 return unicode_compare_eq(left, right_uni);
11160}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011161
Alexander Belopolsky40018472011-02-26 01:02:56 +000011162PyObject *
11163PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011164{
11165 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011166
Victor Stinnere5567ad2012-10-23 02:48:49 +020011167 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11168 Py_RETURN_NOTIMPLEMENTED;
11169
11170 if (PyUnicode_READY(left) == -1 ||
11171 PyUnicode_READY(right) == -1)
11172 return NULL;
11173
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011174 if (left == right) {
11175 switch (op) {
11176 case Py_EQ:
11177 case Py_LE:
11178 case Py_GE:
11179 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011180 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011181 case Py_NE:
11182 case Py_LT:
11183 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011184 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011185 default:
11186 PyErr_BadArgument();
11187 return NULL;
11188 }
11189 }
11190 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011191 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011192 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011193 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011194 }
11195 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011196 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011197 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011198 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011199}
11200
Alexander Belopolsky40018472011-02-26 01:02:56 +000011201int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011202_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11203{
11204 return unicode_eq(aa, bb);
11205}
11206
11207int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011208PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011209{
Victor Stinner77282cb2013-04-14 19:22:47 +020011210 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 void *buf1, *buf2;
11212 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011213 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011214
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011215 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011216 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011217 "'in <string>' requires string as left operand, not %.100s",
11218 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011219 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011220 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011221 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011222 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011223 if (ensure_unicode(str) < 0)
11224 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011226 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011227 kind2 = PyUnicode_KIND(substr);
11228 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011229 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011231 len2 = PyUnicode_GET_LENGTH(substr);
11232 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011233 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011234 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011235 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011236 if (len2 == 1) {
11237 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11238 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011239 return result;
11240 }
11241 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011242 buf2 = _PyUnicode_AsKind(substr, kind1);
11243 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011244 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011245 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011246
Victor Stinner77282cb2013-04-14 19:22:47 +020011247 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011248 case PyUnicode_1BYTE_KIND:
11249 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11250 break;
11251 case PyUnicode_2BYTE_KIND:
11252 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11253 break;
11254 case PyUnicode_4BYTE_KIND:
11255 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11256 break;
11257 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011258 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011259 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011260
Victor Stinner77282cb2013-04-14 19:22:47 +020011261 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011262 PyMem_Free(buf2);
11263
Guido van Rossum403d68b2000-03-13 15:55:09 +000011264 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011265}
11266
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267/* Concat to string or Unicode object giving a new Unicode object. */
11268
Alexander Belopolsky40018472011-02-26 01:02:56 +000011269PyObject *
11270PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011272 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011273 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011274 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011276 if (ensure_unicode(left) < 0)
11277 return NULL;
11278
11279 if (!PyUnicode_Check(right)) {
11280 PyErr_Format(PyExc_TypeError,
11281 "can only concatenate str (not \"%.200s\") to str",
11282 right->ob_type->tp_name);
11283 return NULL;
11284 }
11285 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011286 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287
11288 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011289 if (left == unicode_empty)
11290 return PyUnicode_FromObject(right);
11291 if (right == unicode_empty)
11292 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011294 left_len = PyUnicode_GET_LENGTH(left);
11295 right_len = PyUnicode_GET_LENGTH(right);
11296 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011297 PyErr_SetString(PyExc_OverflowError,
11298 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011299 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011300 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011301 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011302
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011303 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11304 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011305 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011308 result = PyUnicode_New(new_len, maxchar);
11309 if (result == NULL)
11310 return NULL;
11311 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11312 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11313 assert(_PyUnicode_CheckConsistency(result, 1));
11314 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315}
11316
Walter Dörwald1ab83302007-05-18 17:15:44 +000011317void
Victor Stinner23e56682011-10-03 03:54:37 +020011318PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011319{
Victor Stinner23e56682011-10-03 03:54:37 +020011320 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011321 Py_UCS4 maxchar, maxchar2;
11322 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011323
11324 if (p_left == NULL) {
11325 if (!PyErr_Occurred())
11326 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011327 return;
11328 }
Victor Stinner23e56682011-10-03 03:54:37 +020011329 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011330 if (right == NULL || left == NULL
11331 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011332 if (!PyErr_Occurred())
11333 PyErr_BadInternalCall();
11334 goto error;
11335 }
11336
Benjamin Petersonbac79492012-01-14 13:34:47 -050011337 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011338 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011339 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011340 goto error;
11341
Victor Stinner488fa492011-12-12 00:01:39 +010011342 /* Shortcuts */
11343 if (left == unicode_empty) {
11344 Py_DECREF(left);
11345 Py_INCREF(right);
11346 *p_left = right;
11347 return;
11348 }
11349 if (right == unicode_empty)
11350 return;
11351
11352 left_len = PyUnicode_GET_LENGTH(left);
11353 right_len = PyUnicode_GET_LENGTH(right);
11354 if (left_len > PY_SSIZE_T_MAX - right_len) {
11355 PyErr_SetString(PyExc_OverflowError,
11356 "strings are too large to concat");
11357 goto error;
11358 }
11359 new_len = left_len + right_len;
11360
11361 if (unicode_modifiable(left)
11362 && PyUnicode_CheckExact(right)
11363 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011364 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11365 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011366 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011367 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011368 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11369 {
11370 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011371 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011372 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011373
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011374 /* copy 'right' into the newly allocated area of 'left' */
11375 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011376 }
Victor Stinner488fa492011-12-12 00:01:39 +010011377 else {
11378 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11379 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011380 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011381
Victor Stinner488fa492011-12-12 00:01:39 +010011382 /* Concat the two Unicode strings */
11383 res = PyUnicode_New(new_len, maxchar);
11384 if (res == NULL)
11385 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011386 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11387 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011388 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011389 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011390 }
11391 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011392 return;
11393
11394error:
Victor Stinner488fa492011-12-12 00:01:39 +010011395 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011396}
11397
11398void
11399PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11400{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011401 PyUnicode_Append(pleft, right);
11402 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011403}
11404
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011405/*
11406Wraps stringlib_parse_args_finds() and additionally ensures that the
11407first argument is a unicode object.
11408*/
11409
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011410static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011411parse_args_finds_unicode(const char * function_name, PyObject *args,
11412 PyObject **substring,
11413 Py_ssize_t *start, Py_ssize_t *end)
11414{
11415 if(stringlib_parse_args_finds(function_name, args, substring,
11416 start, end)) {
11417 if (ensure_unicode(*substring) < 0)
11418 return 0;
11419 return 1;
11420 }
11421 return 0;
11422}
11423
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011424PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011425 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011427Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011428string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011429interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
11431static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011432unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011434 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011435 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011436 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011438 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 void *buf1, *buf2;
11440 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011442 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011443 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 kind1 = PyUnicode_KIND(self);
11446 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011447 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011448 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 len1 = PyUnicode_GET_LENGTH(self);
11451 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011453 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011454 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011455
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011456 buf1 = PyUnicode_DATA(self);
11457 buf2 = PyUnicode_DATA(substring);
11458 if (kind2 != kind1) {
11459 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011460 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011461 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011462 }
11463 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 case PyUnicode_1BYTE_KIND:
11465 iresult = ucs1lib_count(
11466 ((Py_UCS1*)buf1) + start, end - start,
11467 buf2, len2, PY_SSIZE_T_MAX
11468 );
11469 break;
11470 case PyUnicode_2BYTE_KIND:
11471 iresult = ucs2lib_count(
11472 ((Py_UCS2*)buf1) + start, end - start,
11473 buf2, len2, PY_SSIZE_T_MAX
11474 );
11475 break;
11476 case PyUnicode_4BYTE_KIND:
11477 iresult = ucs4lib_count(
11478 ((Py_UCS4*)buf1) + start, end - start,
11479 buf2, len2, PY_SSIZE_T_MAX
11480 );
11481 break;
11482 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011483 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 }
11485
11486 result = PyLong_FromSsize_t(iresult);
11487
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011488 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491 return result;
11492}
11493
INADA Naoki3ae20562017-01-16 20:41:20 +090011494/*[clinic input]
11495str.encode as unicode_encode
11496
11497 encoding: str(c_default="NULL") = 'utf-8'
11498 The encoding in which to encode the string.
11499 errors: str(c_default="NULL") = 'strict'
11500 The error handling scheme to use for encoding errors.
11501 The default is 'strict' meaning that encoding errors raise a
11502 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11503 'xmlcharrefreplace' as well as any other name registered with
11504 codecs.register_error that can handle UnicodeEncodeErrors.
11505
11506Encode the string using the codec registered for encoding.
11507[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508
11509static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011510unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011511/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011513 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011514}
11515
INADA Naoki3ae20562017-01-16 20:41:20 +090011516/*[clinic input]
11517str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518
INADA Naoki3ae20562017-01-16 20:41:20 +090011519 tabsize: int = 8
11520
11521Return a copy where all tab characters are expanded using spaces.
11522
11523If tabsize is not given, a tab size of 8 characters is assumed.
11524[clinic start generated code]*/
11525
11526static PyObject *
11527unicode_expandtabs_impl(PyObject *self, int tabsize)
11528/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011530 Py_ssize_t i, j, line_pos, src_len, incr;
11531 Py_UCS4 ch;
11532 PyObject *u;
11533 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011534 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011535 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536
Antoine Pitrou22425222011-10-04 19:10:51 +020011537 if (PyUnicode_READY(self) == -1)
11538 return NULL;
11539
Thomas Wouters7e474022000-07-16 12:04:32 +000011540 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011541 src_len = PyUnicode_GET_LENGTH(self);
11542 i = j = line_pos = 0;
11543 kind = PyUnicode_KIND(self);
11544 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011545 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011546 for (; i < src_len; i++) {
11547 ch = PyUnicode_READ(kind, src_data, i);
11548 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011549 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011551 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011552 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011553 goto overflow;
11554 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011556 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011560 goto overflow;
11561 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011563 if (ch == '\n' || ch == '\r')
11564 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011566 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011567 if (!found)
11568 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011569
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011571 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572 if (!u)
11573 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011574 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575
Antoine Pitroue71d5742011-10-04 15:55:09 +020011576 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577
Antoine Pitroue71d5742011-10-04 15:55:09 +020011578 for (; i < src_len; i++) {
11579 ch = PyUnicode_READ(kind, src_data, i);
11580 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011582 incr = tabsize - (line_pos % tabsize);
11583 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011584 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011585 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011587 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011588 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011589 line_pos++;
11590 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011591 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011592 if (ch == '\n' || ch == '\r')
11593 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011595 }
11596 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011597 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011598
Antoine Pitroue71d5742011-10-04 15:55:09 +020011599 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011600 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11601 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602}
11603
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011604PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011605 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606\n\
11607Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011608such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609arguments start and end are interpreted as in slice notation.\n\
11610\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011611Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612
11613static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011616 /* initialize variables to prevent gcc warning */
11617 PyObject *substring = NULL;
11618 Py_ssize_t start = 0;
11619 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011620 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011622 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011625 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011628 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011630 if (result == -2)
11631 return NULL;
11632
Christian Heimes217cfd12007-12-02 14:31:20 +000011633 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634}
11635
11636static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011637unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011639 void *data;
11640 enum PyUnicode_Kind kind;
11641 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011642
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011643 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011644 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011646 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011647 if (PyUnicode_READY(self) == -1) {
11648 return NULL;
11649 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011650 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11651 PyErr_SetString(PyExc_IndexError, "string index out of range");
11652 return NULL;
11653 }
11654 kind = PyUnicode_KIND(self);
11655 data = PyUnicode_DATA(self);
11656 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011657 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658}
11659
Guido van Rossumc2504932007-09-18 19:42:40 +000011660/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011661 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011662static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011663unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011665 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011666
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011667#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011668 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011669#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670 if (_PyUnicode_HASH(self) != -1)
11671 return _PyUnicode_HASH(self);
11672 if (PyUnicode_READY(self) == -1)
11673 return -1;
animalizea1d14252019-01-02 20:16:06 +080011674
Christian Heimes985ecdc2013-11-20 11:46:18 +010011675 x = _Py_HashBytes(PyUnicode_DATA(self),
11676 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011678 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679}
11680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011681PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011682 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683\n\
oldkaa0735f2018-02-02 16:52:55 +080011684Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011685such that sub is contained within S[start:end]. Optional\n\
11686arguments start and end are interpreted as in slice notation.\n\
11687\n\
11688Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689
11690static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011691unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011693 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011694 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011695 PyObject *substring = NULL;
11696 Py_ssize_t start = 0;
11697 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011699 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011702 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011705 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707 if (result == -2)
11708 return NULL;
11709
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710 if (result < 0) {
11711 PyErr_SetString(PyExc_ValueError, "substring not found");
11712 return NULL;
11713 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011714
Christian Heimes217cfd12007-12-02 14:31:20 +000011715 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716}
11717
INADA Naoki3ae20562017-01-16 20:41:20 +090011718/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011719str.isascii as unicode_isascii
11720
11721Return True if all characters in the string are ASCII, False otherwise.
11722
11723ASCII characters have code points in the range U+0000-U+007F.
11724Empty string is ASCII too.
11725[clinic start generated code]*/
11726
11727static PyObject *
11728unicode_isascii_impl(PyObject *self)
11729/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11730{
11731 if (PyUnicode_READY(self) == -1) {
11732 return NULL;
11733 }
11734 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11735}
11736
11737/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011738str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739
INADA Naoki3ae20562017-01-16 20:41:20 +090011740Return True if the string is a lowercase string, False otherwise.
11741
11742A string is lowercase if all cased characters in the string are lowercase and
11743there is at least one cased character in the string.
11744[clinic start generated code]*/
11745
11746static PyObject *
11747unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011748/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011750 Py_ssize_t i, length;
11751 int kind;
11752 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753 int cased;
11754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 if (PyUnicode_READY(self) == -1)
11756 return NULL;
11757 length = PyUnicode_GET_LENGTH(self);
11758 kind = PyUnicode_KIND(self);
11759 data = PyUnicode_DATA(self);
11760
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011762 if (length == 1)
11763 return PyBool_FromLong(
11764 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011766 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011768 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011769
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 for (i = 0; i < length; i++) {
11772 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011773
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011775 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011776 else if (!cased && Py_UNICODE_ISLOWER(ch))
11777 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011779 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780}
11781
INADA Naoki3ae20562017-01-16 20:41:20 +090011782/*[clinic input]
11783str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784
INADA Naoki3ae20562017-01-16 20:41:20 +090011785Return True if the string is an uppercase string, False otherwise.
11786
11787A string is uppercase if all cased characters in the string are uppercase and
11788there is at least one cased character in the string.
11789[clinic start generated code]*/
11790
11791static PyObject *
11792unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011793/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 Py_ssize_t i, length;
11796 int kind;
11797 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798 int cased;
11799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 if (PyUnicode_READY(self) == -1)
11801 return NULL;
11802 length = PyUnicode_GET_LENGTH(self);
11803 kind = PyUnicode_KIND(self);
11804 data = PyUnicode_DATA(self);
11805
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 if (length == 1)
11808 return PyBool_FromLong(
11809 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011811 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011812 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011813 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011814
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 for (i = 0; i < length; i++) {
11817 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011818
Benjamin Peterson29060642009-01-31 22:14:21 +000011819 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011820 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011821 else if (!cased && Py_UNICODE_ISUPPER(ch))
11822 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011824 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825}
11826
INADA Naoki3ae20562017-01-16 20:41:20 +090011827/*[clinic input]
11828str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829
INADA Naoki3ae20562017-01-16 20:41:20 +090011830Return True if the string is a title-cased string, False otherwise.
11831
11832In a title-cased string, upper- and title-case characters may only
11833follow uncased characters and lowercase characters only cased ones.
11834[clinic start generated code]*/
11835
11836static PyObject *
11837unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011838/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 Py_ssize_t i, length;
11841 int kind;
11842 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843 int cased, previous_is_cased;
11844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 if (PyUnicode_READY(self) == -1)
11846 return NULL;
11847 length = PyUnicode_GET_LENGTH(self);
11848 kind = PyUnicode_KIND(self);
11849 data = PyUnicode_DATA(self);
11850
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 if (length == 1) {
11853 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11854 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11855 (Py_UNICODE_ISUPPER(ch) != 0));
11856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011858 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011860 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011861
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862 cased = 0;
11863 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011864 for (i = 0; i < length; i++) {
11865 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011866
Benjamin Peterson29060642009-01-31 22:14:21 +000011867 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11868 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011869 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011870 previous_is_cased = 1;
11871 cased = 1;
11872 }
11873 else if (Py_UNICODE_ISLOWER(ch)) {
11874 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011875 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011876 previous_is_cased = 1;
11877 cased = 1;
11878 }
11879 else
11880 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011882 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883}
11884
INADA Naoki3ae20562017-01-16 20:41:20 +090011885/*[clinic input]
11886str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887
INADA Naoki3ae20562017-01-16 20:41:20 +090011888Return True if the string is a whitespace string, False otherwise.
11889
11890A string is whitespace if all characters in the string are whitespace and there
11891is at least one character in the string.
11892[clinic start generated code]*/
11893
11894static PyObject *
11895unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011896/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 Py_ssize_t i, length;
11899 int kind;
11900 void *data;
11901
11902 if (PyUnicode_READY(self) == -1)
11903 return NULL;
11904 length = PyUnicode_GET_LENGTH(self);
11905 kind = PyUnicode_KIND(self);
11906 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 if (length == 1)
11910 return PyBool_FromLong(
11911 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011913 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011915 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 for (i = 0; i < length; i++) {
11918 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011919 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011920 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011922 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923}
11924
INADA Naoki3ae20562017-01-16 20:41:20 +090011925/*[clinic input]
11926str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011927
INADA Naoki3ae20562017-01-16 20:41:20 +090011928Return True if the string is an alphabetic string, False otherwise.
11929
11930A string is alphabetic if all characters in the string are alphabetic and there
11931is at least one character in the string.
11932[clinic start generated code]*/
11933
11934static PyObject *
11935unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011936/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011937{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 Py_ssize_t i, length;
11939 int kind;
11940 void *data;
11941
11942 if (PyUnicode_READY(self) == -1)
11943 return NULL;
11944 length = PyUnicode_GET_LENGTH(self);
11945 kind = PyUnicode_KIND(self);
11946 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011947
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011948 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011949 if (length == 1)
11950 return PyBool_FromLong(
11951 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011952
11953 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011955 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 for (i = 0; i < length; i++) {
11958 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011959 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011960 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011961 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011962}
11963
INADA Naoki3ae20562017-01-16 20:41:20 +090011964/*[clinic input]
11965str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011966
INADA Naoki3ae20562017-01-16 20:41:20 +090011967Return True if the string is an alpha-numeric string, False otherwise.
11968
11969A string is alpha-numeric if all characters in the string are alpha-numeric and
11970there is at least one character in the string.
11971[clinic start generated code]*/
11972
11973static PyObject *
11974unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011975/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011976{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977 int kind;
11978 void *data;
11979 Py_ssize_t len, i;
11980
11981 if (PyUnicode_READY(self) == -1)
11982 return NULL;
11983
11984 kind = PyUnicode_KIND(self);
11985 data = PyUnicode_DATA(self);
11986 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011987
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011988 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 if (len == 1) {
11990 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11991 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11992 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011993
11994 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011996 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 for (i = 0; i < len; i++) {
11999 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012000 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012001 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012002 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012003 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012004}
12005
INADA Naoki3ae20562017-01-16 20:41:20 +090012006/*[clinic input]
12007str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008
INADA Naoki3ae20562017-01-16 20:41:20 +090012009Return True if the string is a decimal string, False otherwise.
12010
12011A string is a decimal string if all characters in the string are decimal and
12012there is at least one character in the string.
12013[clinic start generated code]*/
12014
12015static PyObject *
12016unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012017/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 Py_ssize_t i, length;
12020 int kind;
12021 void *data;
12022
12023 if (PyUnicode_READY(self) == -1)
12024 return NULL;
12025 length = PyUnicode_GET_LENGTH(self);
12026 kind = PyUnicode_KIND(self);
12027 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 if (length == 1)
12031 return PyBool_FromLong(
12032 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012034 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012036 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 for (i = 0; i < length; i++) {
12039 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012040 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012042 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043}
12044
INADA Naoki3ae20562017-01-16 20:41:20 +090012045/*[clinic input]
12046str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047
INADA Naoki3ae20562017-01-16 20:41:20 +090012048Return True if the string is a digit string, False otherwise.
12049
12050A string is a digit string if all characters in the string are digits and there
12051is at least one character in the string.
12052[clinic start generated code]*/
12053
12054static PyObject *
12055unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012056/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 Py_ssize_t i, length;
12059 int kind;
12060 void *data;
12061
12062 if (PyUnicode_READY(self) == -1)
12063 return NULL;
12064 length = PyUnicode_GET_LENGTH(self);
12065 kind = PyUnicode_KIND(self);
12066 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 if (length == 1) {
12070 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12071 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12072 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012074 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012075 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012076 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 for (i = 0; i < length; i++) {
12079 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012080 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012082 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083}
12084
INADA Naoki3ae20562017-01-16 20:41:20 +090012085/*[clinic input]
12086str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087
INADA Naoki3ae20562017-01-16 20:41:20 +090012088Return True if the string is a numeric string, False otherwise.
12089
12090A string is numeric if all characters in the string are numeric and there is at
12091least one character in the string.
12092[clinic start generated code]*/
12093
12094static PyObject *
12095unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012096/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012098 Py_ssize_t i, length;
12099 int kind;
12100 void *data;
12101
12102 if (PyUnicode_READY(self) == -1)
12103 return NULL;
12104 length = PyUnicode_GET_LENGTH(self);
12105 kind = PyUnicode_KIND(self);
12106 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 if (length == 1)
12110 return PyBool_FromLong(
12111 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012113 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012115 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 for (i = 0; i < length; i++) {
12118 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012119 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012121 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122}
12123
Martin v. Löwis47383402007-08-15 07:32:56 +000012124int
12125PyUnicode_IsIdentifier(PyObject *self)
12126{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 int kind;
12128 void *data;
12129 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012130 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 if (PyUnicode_READY(self) == -1) {
12133 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012134 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012135 }
12136
12137 /* Special case for empty strings */
12138 if (PyUnicode_GET_LENGTH(self) == 0)
12139 return 0;
12140 kind = PyUnicode_KIND(self);
12141 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012142
12143 /* PEP 3131 says that the first character must be in
12144 XID_Start and subsequent characters in XID_Continue,
12145 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012146 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012147 letters, digits, underscore). However, given the current
12148 definition of XID_Start and XID_Continue, it is sufficient
12149 to check just for these, except that _ must be allowed
12150 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012151 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012152 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012153 return 0;
12154
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012155 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012156 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012157 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012158 return 1;
12159}
12160
INADA Naoki3ae20562017-01-16 20:41:20 +090012161/*[clinic input]
12162str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012163
INADA Naoki3ae20562017-01-16 20:41:20 +090012164Return True if the string is a valid Python identifier, False otherwise.
12165
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012166Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012167such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012168[clinic start generated code]*/
12169
12170static PyObject *
12171unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012172/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012173{
12174 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12175}
12176
INADA Naoki3ae20562017-01-16 20:41:20 +090012177/*[clinic input]
12178str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012179
INADA Naoki3ae20562017-01-16 20:41:20 +090012180Return True if the string is printable, False otherwise.
12181
12182A string is printable if all of its characters are considered printable in
12183repr() or if it is empty.
12184[clinic start generated code]*/
12185
12186static PyObject *
12187unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012188/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012189{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 Py_ssize_t i, length;
12191 int kind;
12192 void *data;
12193
12194 if (PyUnicode_READY(self) == -1)
12195 return NULL;
12196 length = PyUnicode_GET_LENGTH(self);
12197 kind = PyUnicode_KIND(self);
12198 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012199
12200 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 if (length == 1)
12202 return PyBool_FromLong(
12203 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 for (i = 0; i < length; i++) {
12206 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012207 Py_RETURN_FALSE;
12208 }
12209 }
12210 Py_RETURN_TRUE;
12211}
12212
INADA Naoki3ae20562017-01-16 20:41:20 +090012213/*[clinic input]
12214str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215
INADA Naoki3ae20562017-01-16 20:41:20 +090012216 iterable: object
12217 /
12218
12219Concatenate any number of strings.
12220
Martin Panter91a88662017-01-24 00:30:06 +000012221The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012222The result is returned as a new string.
12223
12224Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12225[clinic start generated code]*/
12226
12227static PyObject *
12228unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012229/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230{
INADA Naoki3ae20562017-01-16 20:41:20 +090012231 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232}
12233
Martin v. Löwis18e16552006-02-15 17:27:45 +000012234static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012235unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 if (PyUnicode_READY(self) == -1)
12238 return -1;
12239 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240}
12241
INADA Naoki3ae20562017-01-16 20:41:20 +090012242/*[clinic input]
12243str.ljust as unicode_ljust
12244
12245 width: Py_ssize_t
12246 fillchar: Py_UCS4 = ' '
12247 /
12248
12249Return a left-justified string of length width.
12250
12251Padding is done using the specified fill character (default is a space).
12252[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253
12254static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012255unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12256/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012258 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012259 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260
Victor Stinnerc4b49542011-12-11 22:44:26 +010012261 if (PyUnicode_GET_LENGTH(self) >= width)
12262 return unicode_result_unchanged(self);
12263
12264 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265}
12266
INADA Naoki3ae20562017-01-16 20:41:20 +090012267/*[clinic input]
12268str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269
INADA Naoki3ae20562017-01-16 20:41:20 +090012270Return a copy of the string converted to lowercase.
12271[clinic start generated code]*/
12272
12273static PyObject *
12274unicode_lower_impl(PyObject *self)
12275/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012277 if (PyUnicode_READY(self) == -1)
12278 return NULL;
12279 if (PyUnicode_IS_ASCII(self))
12280 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012281 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282}
12283
/* Strip modes; the values double as indexes into stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, indexed by the strip modes above */
static const char *stripfuncnames[] = {
    "lstrip",   /* LEFTSTRIP */
    "rstrip",   /* RIGHTSTRIP */
    "strip"     /* BOTHSTRIP */
};

/* Name of the str method that corresponds to strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012292
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012293/* externally visible for str.strip(unicode) */
12294PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012295_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012296{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 void *data;
12298 int kind;
12299 Py_ssize_t i, j, len;
12300 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012301 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12304 return NULL;
12305
12306 kind = PyUnicode_KIND(self);
12307 data = PyUnicode_DATA(self);
12308 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012309 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12311 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012312 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012313
Benjamin Peterson14339b62009-01-31 16:36:08 +000012314 i = 0;
12315 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012316 while (i < len) {
12317 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12318 if (!BLOOM(sepmask, ch))
12319 break;
12320 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12321 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012322 i++;
12323 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012324 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012325
Benjamin Peterson14339b62009-01-31 16:36:08 +000012326 j = len;
12327 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012328 j--;
12329 while (j >= i) {
12330 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12331 if (!BLOOM(sepmask, ch))
12332 break;
12333 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12334 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012335 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012336 }
12337
Benjamin Peterson29060642009-01-31 22:14:21 +000012338 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012339 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012340
Victor Stinner7931d9a2011-11-04 00:22:48 +010012341 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342}
12343
12344PyObject*
12345PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12346{
12347 unsigned char *data;
12348 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012349 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350
Victor Stinnerde636f32011-10-01 03:55:54 +020012351 if (PyUnicode_READY(self) == -1)
12352 return NULL;
12353
Victor Stinner684d5fd2012-05-03 02:32:34 +020012354 length = PyUnicode_GET_LENGTH(self);
12355 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012356
Victor Stinner684d5fd2012-05-03 02:32:34 +020012357 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012358 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359
Victor Stinnerde636f32011-10-01 03:55:54 +020012360 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012361 PyErr_SetString(PyExc_IndexError, "string index out of range");
12362 return NULL;
12363 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012364 if (start >= length || end < start)
12365 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012366
Victor Stinner684d5fd2012-05-03 02:32:34 +020012367 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012368 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012369 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012370 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012371 }
12372 else {
12373 kind = PyUnicode_KIND(self);
12374 data = PyUnicode_1BYTE_DATA(self);
12375 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012376 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012377 length);
12378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012379}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380
12381static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012382do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 Py_ssize_t len, i, j;
12385
12386 if (PyUnicode_READY(self) == -1)
12387 return NULL;
12388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012390
Victor Stinnercc7af722013-04-09 22:39:24 +020012391 if (PyUnicode_IS_ASCII(self)) {
12392 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12393
12394 i = 0;
12395 if (striptype != RIGHTSTRIP) {
12396 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012397 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012398 if (!_Py_ascii_whitespace[ch])
12399 break;
12400 i++;
12401 }
12402 }
12403
12404 j = len;
12405 if (striptype != LEFTSTRIP) {
12406 j--;
12407 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012408 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012409 if (!_Py_ascii_whitespace[ch])
12410 break;
12411 j--;
12412 }
12413 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012414 }
12415 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012416 else {
12417 int kind = PyUnicode_KIND(self);
12418 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012419
Victor Stinnercc7af722013-04-09 22:39:24 +020012420 i = 0;
12421 if (striptype != RIGHTSTRIP) {
12422 while (i < len) {
12423 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12424 if (!Py_UNICODE_ISSPACE(ch))
12425 break;
12426 i++;
12427 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012428 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012429
12430 j = len;
12431 if (striptype != LEFTSTRIP) {
12432 j--;
12433 while (j >= i) {
12434 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12435 if (!Py_UNICODE_ISSPACE(ch))
12436 break;
12437 j--;
12438 }
12439 j++;
12440 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012441 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012442
Victor Stinner7931d9a2011-11-04 00:22:48 +010012443 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444}
12445
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012446
12447static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012448do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012449{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012450 if (sep != NULL && sep != Py_None) {
12451 if (PyUnicode_Check(sep))
12452 return _PyUnicode_XStrip(self, striptype, sep);
12453 else {
12454 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012455 "%s arg must be None or str",
12456 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012457 return NULL;
12458 }
12459 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012460
Benjamin Peterson14339b62009-01-31 16:36:08 +000012461 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012462}
12463
12464
INADA Naoki3ae20562017-01-16 20:41:20 +090012465/*[clinic input]
12466str.strip as unicode_strip
12467
12468 chars: object = None
12469 /
12470
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012472
12473If chars is given and not None, remove characters in chars instead.
12474[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012475
12476static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012477unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012478/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012479{
INADA Naoki3ae20562017-01-16 20:41:20 +090012480 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012481}
12482
12483
INADA Naoki3ae20562017-01-16 20:41:20 +090012484/*[clinic input]
12485str.lstrip as unicode_lstrip
12486
12487 chars: object = NULL
12488 /
12489
12490Return a copy of the string with leading whitespace removed.
12491
12492If chars is given and not None, remove characters in chars instead.
12493[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012494
12495static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012496unicode_lstrip_impl(PyObject *self, PyObject *chars)
12497/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012498{
INADA Naoki3ae20562017-01-16 20:41:20 +090012499 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012500}
12501
12502
INADA Naoki3ae20562017-01-16 20:41:20 +090012503/*[clinic input]
12504str.rstrip as unicode_rstrip
12505
12506 chars: object = NULL
12507 /
12508
12509Return a copy of the string with trailing whitespace removed.
12510
12511If chars is given and not None, remove characters in chars instead.
12512[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012513
12514static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012515unicode_rstrip_impl(PyObject *self, PyObject *chars)
12516/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012517{
INADA Naoki3ae20562017-01-16 20:41:20 +090012518 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012519}
12520
12521
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012523unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012525 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527
Serhiy Storchaka05997252013-01-26 12:14:02 +020012528 if (len < 1)
12529 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530
Victor Stinnerc4b49542011-12-11 22:44:26 +010012531 /* no repeat, return original string */
12532 if (len == 1)
12533 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012534
Benjamin Petersonbac79492012-01-14 13:34:47 -050012535 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012536 return NULL;
12537
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012538 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012539 PyErr_SetString(PyExc_OverflowError,
12540 "repeated string is too long");
12541 return NULL;
12542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012544
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012545 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546 if (!u)
12547 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012548 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 if (PyUnicode_GET_LENGTH(str) == 1) {
12551 const int kind = PyUnicode_KIND(str);
12552 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012553 if (kind == PyUnicode_1BYTE_KIND) {
12554 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012555 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012556 }
12557 else if (kind == PyUnicode_2BYTE_KIND) {
12558 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012559 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012560 ucs2[n] = fill_char;
12561 } else {
12562 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12563 assert(kind == PyUnicode_4BYTE_KIND);
12564 for (n = 0; n < len; ++n)
12565 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012566 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567 }
12568 else {
12569 /* number of characters copied this far */
12570 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012571 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012572 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012573 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012575 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012577 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012578 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012579 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580 }
12581
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012582 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012583 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584}
12585
Alexander Belopolsky40018472011-02-26 01:02:56 +000012586PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012587PyUnicode_Replace(PyObject *str,
12588 PyObject *substr,
12589 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012590 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012592 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12593 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012594 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012595 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596}
12597
INADA Naoki3ae20562017-01-16 20:41:20 +090012598/*[clinic input]
12599str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012600
INADA Naoki3ae20562017-01-16 20:41:20 +090012601 old: unicode
12602 new: unicode
12603 count: Py_ssize_t = -1
12604 Maximum number of occurrences to replace.
12605 -1 (the default value) means replace all occurrences.
12606 /
12607
12608Return a copy with all occurrences of substring old replaced by new.
12609
12610If the optional argument count is given, only the first count occurrences are
12611replaced.
12612[clinic start generated code]*/
12613
12614static PyObject *
12615unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12616 Py_ssize_t count)
12617/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012619 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012620 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012621 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622}
12623
Alexander Belopolsky40018472011-02-26 01:02:56 +000012624static PyObject *
12625unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012627 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 Py_ssize_t isize;
12629 Py_ssize_t osize, squote, dquote, i, o;
12630 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012631 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012635 return NULL;
12636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 isize = PyUnicode_GET_LENGTH(unicode);
12638 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 /* Compute length of output, quote characters, and
12641 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012642 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 max = 127;
12644 squote = dquote = 0;
12645 ikind = PyUnicode_KIND(unicode);
12646 for (i = 0; i < isize; i++) {
12647 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012648 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012650 case '\'': squote++; break;
12651 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012653 incr = 2;
12654 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 default:
12656 /* Fast-path ASCII */
12657 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012658 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012660 ;
12661 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012664 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012666 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012668 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012670 if (osize > PY_SSIZE_T_MAX - incr) {
12671 PyErr_SetString(PyExc_OverflowError,
12672 "string is too long to generate repr");
12673 return NULL;
12674 }
12675 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 }
12677
12678 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012679 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012681 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 if (dquote)
12683 /* Both squote and dquote present. Use squote,
12684 and escape them */
12685 osize += squote;
12686 else
12687 quote = '"';
12688 }
Victor Stinner55c08782013-04-14 18:45:39 +020012689 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690
12691 repr = PyUnicode_New(osize, max);
12692 if (repr == NULL)
12693 return NULL;
12694 okind = PyUnicode_KIND(repr);
12695 odata = PyUnicode_DATA(repr);
12696
12697 PyUnicode_WRITE(okind, odata, 0, quote);
12698 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012699 if (unchanged) {
12700 _PyUnicode_FastCopyCharacters(repr, 1,
12701 unicode, 0,
12702 isize);
12703 }
12704 else {
12705 for (i = 0, o = 1; i < isize; i++) {
12706 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012707
Victor Stinner55c08782013-04-14 18:45:39 +020012708 /* Escape quotes and backslashes */
12709 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012710 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012712 continue;
12713 }
12714
12715 /* Map special whitespace to '\t', \n', '\r' */
12716 if (ch == '\t') {
12717 PyUnicode_WRITE(okind, odata, o++, '\\');
12718 PyUnicode_WRITE(okind, odata, o++, 't');
12719 }
12720 else if (ch == '\n') {
12721 PyUnicode_WRITE(okind, odata, o++, '\\');
12722 PyUnicode_WRITE(okind, odata, o++, 'n');
12723 }
12724 else if (ch == '\r') {
12725 PyUnicode_WRITE(okind, odata, o++, '\\');
12726 PyUnicode_WRITE(okind, odata, o++, 'r');
12727 }
12728
12729 /* Map non-printable US ASCII to '\xhh' */
12730 else if (ch < ' ' || ch == 0x7F) {
12731 PyUnicode_WRITE(okind, odata, o++, '\\');
12732 PyUnicode_WRITE(okind, odata, o++, 'x');
12733 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12734 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12735 }
12736
12737 /* Copy ASCII characters as-is */
12738 else if (ch < 0x7F) {
12739 PyUnicode_WRITE(okind, odata, o++, ch);
12740 }
12741
12742 /* Non-ASCII characters */
12743 else {
12744 /* Map Unicode whitespace and control characters
12745 (categories Z* and C* except ASCII space)
12746 */
12747 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12748 PyUnicode_WRITE(okind, odata, o++, '\\');
12749 /* Map 8-bit characters to '\xhh' */
12750 if (ch <= 0xff) {
12751 PyUnicode_WRITE(okind, odata, o++, 'x');
12752 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12753 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12754 }
12755 /* Map 16-bit characters to '\uxxxx' */
12756 else if (ch <= 0xffff) {
12757 PyUnicode_WRITE(okind, odata, o++, 'u');
12758 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12759 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12760 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12761 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12762 }
12763 /* Map 21-bit characters to '\U00xxxxxx' */
12764 else {
12765 PyUnicode_WRITE(okind, odata, o++, 'U');
12766 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12767 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12768 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12769 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12770 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12771 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12772 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12773 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12774 }
12775 }
12776 /* Copy characters as-is */
12777 else {
12778 PyUnicode_WRITE(okind, odata, o++, ch);
12779 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012780 }
12781 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012782 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012784 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012785 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012786}
12787
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012788PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012789 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790\n\
12791Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012792such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012793arguments start and end are interpreted as in slice notation.\n\
12794\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012795Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796
12797static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012800 /* initialize variables to prevent gcc warning */
12801 PyObject *substring = NULL;
12802 Py_ssize_t start = 0;
12803 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012804 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012806 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012807 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012809 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012810 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012812 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012814 if (result == -2)
12815 return NULL;
12816
Christian Heimes217cfd12007-12-02 14:31:20 +000012817 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818}
12819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012820PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012822\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012823Return the highest index in S where substring sub is found,\n\
12824such that sub is contained within S[start:end]. Optional\n\
12825arguments start and end are interpreted as in slice notation.\n\
12826\n\
12827Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828
12829static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012830unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012832 /* initialize variables to prevent gcc warning */
12833 PyObject *substring = NULL;
12834 Py_ssize_t start = 0;
12835 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012836 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012838 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012839 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012840
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012841 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012842 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012843
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012844 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012846 if (result == -2)
12847 return NULL;
12848
Guido van Rossumd57fd912000-03-10 22:53:23 +000012849 if (result < 0) {
12850 PyErr_SetString(PyExc_ValueError, "substring not found");
12851 return NULL;
12852 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012853
Christian Heimes217cfd12007-12-02 14:31:20 +000012854 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012855}
12856
INADA Naoki3ae20562017-01-16 20:41:20 +090012857/*[clinic input]
12858str.rjust as unicode_rjust
12859
12860 width: Py_ssize_t
12861 fillchar: Py_UCS4 = ' '
12862 /
12863
12864Return a right-justified string of length width.
12865
12866Padding is done using the specified fill character (default is a space).
12867[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868
12869static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012870unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12871/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012873 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874 return NULL;
12875
Victor Stinnerc4b49542011-12-11 22:44:26 +010012876 if (PyUnicode_GET_LENGTH(self) >= width)
12877 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012878
Victor Stinnerc4b49542011-12-11 22:44:26 +010012879 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880}
12881
Alexander Belopolsky40018472011-02-26 01:02:56 +000012882PyObject *
12883PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012885 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012886 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012888 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889}
12890
INADA Naoki3ae20562017-01-16 20:41:20 +090012891/*[clinic input]
12892str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893
INADA Naoki3ae20562017-01-16 20:41:20 +090012894 sep: object = None
        The delimiter according to which to split the string.
12896 None (the default value) means split according to any whitespace,
12897 and discard empty strings from the result.
12898 maxsplit: Py_ssize_t = -1
12899 Maximum number of splits to do.
12900 -1 (the default value) means no limit.
12901
12902Return a list of the words in the string, using sep as the delimiter string.
12903[clinic start generated code]*/
12904
12905static PyObject *
12906unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12907/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012908{
INADA Naoki3ae20562017-01-16 20:41:20 +090012909 if (sep == Py_None)
12910 return split(self, NULL, maxsplit);
12911 if (PyUnicode_Check(sep))
12912 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012913
Victor Stinner998b8062018-09-12 00:23:25 +020012914 PyErr_Format(PyExc_TypeError,
12915 "must be str or None, not %.100s",
12916 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012917 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012918}
12919
Thomas Wouters477c8d52006-05-27 19:21:47 +000012920PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012921PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012922{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012923 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012924 int kind1, kind2;
12925 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012926 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012927
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012928 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012929 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012930
Victor Stinner14f8f022011-10-05 20:58:25 +020012931 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012932 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 len1 = PyUnicode_GET_LENGTH(str_obj);
12934 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012935 if (kind1 < kind2 || len1 < len2) {
12936 _Py_INCREF_UNICODE_EMPTY();
12937 if (!unicode_empty)
12938 out = NULL;
12939 else {
12940 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12941 Py_DECREF(unicode_empty);
12942 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012943 return out;
12944 }
12945 buf1 = PyUnicode_DATA(str_obj);
12946 buf2 = PyUnicode_DATA(sep_obj);
12947 if (kind2 != kind1) {
12948 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12949 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012950 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012951 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012952
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012953 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012954 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012955 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12956 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12957 else
12958 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012959 break;
12960 case PyUnicode_2BYTE_KIND:
12961 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12962 break;
12963 case PyUnicode_4BYTE_KIND:
12964 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12965 break;
12966 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012967 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012968 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012969
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012970 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012971 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012972
12973 return out;
12974}
12975
12976
12977PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012978PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012979{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012980 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012981 int kind1, kind2;
12982 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012983 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012984
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012985 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012986 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012987
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012988 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012989 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012990 len1 = PyUnicode_GET_LENGTH(str_obj);
12991 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012992 if (kind1 < kind2 || len1 < len2) {
12993 _Py_INCREF_UNICODE_EMPTY();
12994 if (!unicode_empty)
12995 out = NULL;
12996 else {
12997 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12998 Py_DECREF(unicode_empty);
12999 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013000 return out;
13001 }
13002 buf1 = PyUnicode_DATA(str_obj);
13003 buf2 = PyUnicode_DATA(sep_obj);
13004 if (kind2 != kind1) {
13005 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13006 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013007 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013010 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013012 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13013 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13014 else
13015 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013016 break;
13017 case PyUnicode_2BYTE_KIND:
13018 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13019 break;
13020 case PyUnicode_4BYTE_KIND:
13021 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13022 break;
13023 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013024 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013026
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013027 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013028 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013029
13030 return out;
13031}
13032
INADA Naoki3ae20562017-01-16 20:41:20 +090013033/*[clinic input]
13034str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013035
INADA Naoki3ae20562017-01-16 20:41:20 +090013036 sep: object
13037 /
13038
13039Partition the string into three parts using the given separator.
13040
13041This will search for the separator in the string. If the separator is found,
13042returns a 3-tuple containing the part before the separator, the separator
13043itself, and the part after it.
13044
13045If the separator is not found, returns a 3-tuple containing the original string
13046and two empty strings.
13047[clinic start generated code]*/
13048
13049static PyObject *
13050unicode_partition(PyObject *self, PyObject *sep)
13051/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013052{
INADA Naoki3ae20562017-01-16 20:41:20 +090013053 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013054}
13055
INADA Naoki3ae20562017-01-16 20:41:20 +090013056/*[clinic input]
13057str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013058
INADA Naoki3ae20562017-01-16 20:41:20 +090013059Partition the string into three parts using the given separator.
13060
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013061This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013062the separator is found, returns a 3-tuple containing the part before the
13063separator, the separator itself, and the part after it.
13064
13065If the separator is not found, returns a 3-tuple containing two empty strings
13066and the original string.
13067[clinic start generated code]*/
13068
13069static PyObject *
13070unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013071/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013072{
INADA Naoki3ae20562017-01-16 20:41:20 +090013073 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013074}
13075
Alexander Belopolsky40018472011-02-26 01:02:56 +000013076PyObject *
13077PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013078{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013079 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013080 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013081
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013082 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013083}
13084
INADA Naoki3ae20562017-01-16 20:41:20 +090013085/*[clinic input]
13086str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013087
INADA Naoki3ae20562017-01-16 20:41:20 +090013088Return a list of the words in the string, using sep as the delimiter string.
13089
13090Splits are done starting at the end of the string and working to the front.
13091[clinic start generated code]*/
13092
13093static PyObject *
13094unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13095/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013096{
INADA Naoki3ae20562017-01-16 20:41:20 +090013097 if (sep == Py_None)
13098 return rsplit(self, NULL, maxsplit);
13099 if (PyUnicode_Check(sep))
13100 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013101
Victor Stinner998b8062018-09-12 00:23:25 +020013102 PyErr_Format(PyExc_TypeError,
13103 "must be str or None, not %.100s",
13104 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013105 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013106}
13107
INADA Naoki3ae20562017-01-16 20:41:20 +090013108/*[clinic input]
13109str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013111 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013112
13113Return a list of the lines in the string, breaking at line boundaries.
13114
13115Line breaks are not included in the resulting list unless keepends is given and
13116true.
13117[clinic start generated code]*/
13118
13119static PyObject *
13120unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013121/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013123 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124}
13125
13126static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013127PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013129 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130}
13131
INADA Naoki3ae20562017-01-16 20:41:20 +090013132/*[clinic input]
13133str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013134
INADA Naoki3ae20562017-01-16 20:41:20 +090013135Convert uppercase characters to lowercase and lowercase characters to uppercase.
13136[clinic start generated code]*/
13137
13138static PyObject *
13139unicode_swapcase_impl(PyObject *self)
13140/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013142 if (PyUnicode_READY(self) == -1)
13143 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013144 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145}
13146
Larry Hastings61272b72014-01-07 12:41:53 -080013147/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013148
Larry Hastings31826802013-10-19 00:09:25 -070013149@staticmethod
13150str.maketrans as unicode_maketrans
13151
13152 x: object
13153
13154 y: unicode=NULL
13155
13156 z: unicode=NULL
13157
13158 /
13159
13160Return a translation table usable for str.translate().
13161
13162If there is only one argument, it must be a dictionary mapping Unicode
13163ordinals (integers) or characters to Unicode ordinals, strings or None.
13164Character keys will be then converted to ordinals.
13165If there are two arguments, they must be strings of equal length, and
13166in the resulting dictionary, each character in x will be mapped to the
13167character at the same position in y. If there is a third argument, it
13168must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013169[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013170
Larry Hastings31826802013-10-19 00:09:25 -070013171static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013172unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013173/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013174{
Georg Brandlceee0772007-11-27 23:48:05 +000013175 PyObject *new = NULL, *key, *value;
13176 Py_ssize_t i = 0;
13177 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013178
Georg Brandlceee0772007-11-27 23:48:05 +000013179 new = PyDict_New();
13180 if (!new)
13181 return NULL;
13182 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013183 int x_kind, y_kind, z_kind;
13184 void *x_data, *y_data, *z_data;
13185
Georg Brandlceee0772007-11-27 23:48:05 +000013186 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013187 if (!PyUnicode_Check(x)) {
13188 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13189 "be a string if there is a second argument");
13190 goto err;
13191 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013192 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013193 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13194 "arguments must have equal length");
13195 goto err;
13196 }
13197 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013198 x_kind = PyUnicode_KIND(x);
13199 y_kind = PyUnicode_KIND(y);
13200 x_data = PyUnicode_DATA(x);
13201 y_data = PyUnicode_DATA(y);
13202 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13203 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013204 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013205 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013206 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013207 if (!value) {
13208 Py_DECREF(key);
13209 goto err;
13210 }
Georg Brandlceee0772007-11-27 23:48:05 +000013211 res = PyDict_SetItem(new, key, value);
13212 Py_DECREF(key);
13213 Py_DECREF(value);
13214 if (res < 0)
13215 goto err;
13216 }
13217 /* create entries for deleting chars in z */
13218 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013219 z_kind = PyUnicode_KIND(z);
13220 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013221 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013222 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013223 if (!key)
13224 goto err;
13225 res = PyDict_SetItem(new, key, Py_None);
13226 Py_DECREF(key);
13227 if (res < 0)
13228 goto err;
13229 }
13230 }
13231 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013232 int kind;
13233 void *data;
13234
Georg Brandlceee0772007-11-27 23:48:05 +000013235 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013236 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013237 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13238 "to maketrans it must be a dict");
13239 goto err;
13240 }
13241 /* copy entries into the new dict, converting string keys to int keys */
13242 while (PyDict_Next(x, &i, &key, &value)) {
13243 if (PyUnicode_Check(key)) {
13244 /* convert string keys to integer keys */
13245 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013246 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013247 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13248 "table must be of length 1");
13249 goto err;
13250 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013251 kind = PyUnicode_KIND(key);
13252 data = PyUnicode_DATA(key);
13253 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013254 if (!newkey)
13255 goto err;
13256 res = PyDict_SetItem(new, newkey, value);
13257 Py_DECREF(newkey);
13258 if (res < 0)
13259 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013260 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013261 /* just keep integer keys */
13262 if (PyDict_SetItem(new, key, value) < 0)
13263 goto err;
13264 } else {
13265 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13266 "be strings or integers");
13267 goto err;
13268 }
13269 }
13270 }
13271 return new;
13272 err:
13273 Py_DECREF(new);
13274 return NULL;
13275}
13276
INADA Naoki3ae20562017-01-16 20:41:20 +090013277/*[clinic input]
13278str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013279
INADA Naoki3ae20562017-01-16 20:41:20 +090013280 table: object
13281 Translation table, which must be a mapping of Unicode ordinals to
13282 Unicode ordinals, strings, or None.
13283 /
13284
13285Replace each character in the string using the given translation table.
13286
13287The table must implement lookup/indexing via __getitem__, for instance a
13288dictionary or list. If this operation raises LookupError, the character is
13289left untouched. Characters mapped to None are deleted.
13290[clinic start generated code]*/
13291
13292static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013293unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013294/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013296 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013297}
13298
INADA Naoki3ae20562017-01-16 20:41:20 +090013299/*[clinic input]
13300str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013301
INADA Naoki3ae20562017-01-16 20:41:20 +090013302Return a copy of the string converted to uppercase.
13303[clinic start generated code]*/
13304
13305static PyObject *
13306unicode_upper_impl(PyObject *self)
13307/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013308{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013309 if (PyUnicode_READY(self) == -1)
13310 return NULL;
13311 if (PyUnicode_IS_ASCII(self))
13312 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013313 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013314}
13315
INADA Naoki3ae20562017-01-16 20:41:20 +090013316/*[clinic input]
13317str.zfill as unicode_zfill
13318
13319 width: Py_ssize_t
13320 /
13321
13322Pad a numeric string with zeros on the left, to fill a field of the given width.
13323
13324The string is never truncated.
13325[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013326
13327static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013328unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013329/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013330{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013331 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013332 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013333 int kind;
13334 void *data;
13335 Py_UCS4 chr;
13336
Benjamin Petersonbac79492012-01-14 13:34:47 -050013337 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013338 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013339
Victor Stinnerc4b49542011-12-11 22:44:26 +010013340 if (PyUnicode_GET_LENGTH(self) >= width)
13341 return unicode_result_unchanged(self);
13342
13343 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013344
13345 u = pad(self, fill, 0, '0');
13346
Walter Dörwald068325e2002-04-15 13:36:47 +000013347 if (u == NULL)
13348 return NULL;
13349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013350 kind = PyUnicode_KIND(u);
13351 data = PyUnicode_DATA(u);
13352 chr = PyUnicode_READ(kind, data, fill);
13353
13354 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013355 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013356 PyUnicode_WRITE(kind, data, 0, chr);
13357 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013358 }
13359
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013360 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013361 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013363
#if 0
/* Compiled out: wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13371
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013372PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013373 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013374\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013375Return True if S starts with the specified prefix, False otherwise.\n\
13376With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013377With optional end, stop comparing S at that position.\n\
13378prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013379
13380static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013381unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013382 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013383{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013384 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013385 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013386 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013387 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013388 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013389
Jesus Ceaac451502011-04-20 17:09:23 +020013390 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013391 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013392 if (PyTuple_Check(subobj)) {
13393 Py_ssize_t i;
13394 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013395 substring = PyTuple_GET_ITEM(subobj, i);
13396 if (!PyUnicode_Check(substring)) {
13397 PyErr_Format(PyExc_TypeError,
13398 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013399 "not %.100s",
13400 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013401 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013402 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013403 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013404 if (result == -1)
13405 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013406 if (result) {
13407 Py_RETURN_TRUE;
13408 }
13409 }
13410 /* nothing matched */
13411 Py_RETURN_FALSE;
13412 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013413 if (!PyUnicode_Check(subobj)) {
13414 PyErr_Format(PyExc_TypeError,
13415 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013416 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013417 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013418 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013419 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013420 if (result == -1)
13421 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013422 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013423}
13424
13425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013426PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013427 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013428\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013429Return True if S ends with the specified suffix, False otherwise.\n\
13430With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013431With optional end, stop comparing S at that position.\n\
13432suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013433
13434static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013435unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013436 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013437{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013438 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013439 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013440 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013441 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013442 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013443
Jesus Ceaac451502011-04-20 17:09:23 +020013444 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013445 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013446 if (PyTuple_Check(subobj)) {
13447 Py_ssize_t i;
13448 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013449 substring = PyTuple_GET_ITEM(subobj, i);
13450 if (!PyUnicode_Check(substring)) {
13451 PyErr_Format(PyExc_TypeError,
13452 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013453 "not %.100s",
13454 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013455 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013456 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013457 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013458 if (result == -1)
13459 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013460 if (result) {
13461 Py_RETURN_TRUE;
13462 }
13463 }
13464 Py_RETURN_FALSE;
13465 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013466 if (!PyUnicode_Check(subobj)) {
13467 PyErr_Format(PyExc_TypeError,
13468 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013469 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013470 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013471 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013472 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013473 if (result == -1)
13474 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013475 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013476}
13477
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013478static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013479_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013480{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013481 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13482 writer->data = PyUnicode_DATA(writer->buffer);
13483
13484 if (!writer->readonly) {
13485 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013486 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013487 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013488 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013489 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13490 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13491 writer->kind = PyUnicode_WCHAR_KIND;
13492 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13493
Victor Stinner8f674cc2013-04-17 23:02:17 +020013494 /* Copy-on-write mode: set buffer size to 0 so
13495 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13496 * next write. */
13497 writer->size = 0;
13498 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013499}
13500
Victor Stinnerd3f08822012-05-29 12:57:52 +020013501void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013502_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013503{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013504 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013505
13506 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013507 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013508
13509 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13510 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13511 writer->kind = PyUnicode_WCHAR_KIND;
13512 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013513}
13514
Victor Stinnerd3f08822012-05-29 12:57:52 +020013515int
13516_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13517 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013518{
13519 Py_ssize_t newlen;
13520 PyObject *newbuffer;
13521
Victor Stinner2740e462016-09-06 16:58:36 -070013522 assert(maxchar <= MAX_UNICODE);
13523
Victor Stinnerca9381e2015-09-22 00:58:32 +020013524 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013525 assert((maxchar > writer->maxchar && length >= 0)
13526 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013527
Victor Stinner202fdca2012-05-07 12:47:02 +020013528 if (length > PY_SSIZE_T_MAX - writer->pos) {
13529 PyErr_NoMemory();
13530 return -1;
13531 }
13532 newlen = writer->pos + length;
13533
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013534 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013535
Victor Stinnerd3f08822012-05-29 12:57:52 +020013536 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013537 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013538 if (writer->overallocate
13539 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13540 /* overallocate to limit the number of realloc() */
13541 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013542 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013543 if (newlen < writer->min_length)
13544 newlen = writer->min_length;
13545
Victor Stinnerd3f08822012-05-29 12:57:52 +020013546 writer->buffer = PyUnicode_New(newlen, maxchar);
13547 if (writer->buffer == NULL)
13548 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013549 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013550 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013551 if (writer->overallocate
13552 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13553 /* overallocate to limit the number of realloc() */
13554 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013555 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013556 if (newlen < writer->min_length)
13557 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013558
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013559 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013560 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013561 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013562 newbuffer = PyUnicode_New(newlen, maxchar);
13563 if (newbuffer == NULL)
13564 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013565 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13566 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013567 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013568 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013569 }
13570 else {
13571 newbuffer = resize_compact(writer->buffer, newlen);
13572 if (newbuffer == NULL)
13573 return -1;
13574 }
13575 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013576 }
13577 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013578 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013579 newbuffer = PyUnicode_New(writer->size, maxchar);
13580 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013581 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013582 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13583 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013584 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013585 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013586 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013587 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013588
13589#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013590}
13591
Victor Stinnerca9381e2015-09-22 00:58:32 +020013592int
13593_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13594 enum PyUnicode_Kind kind)
13595{
13596 Py_UCS4 maxchar;
13597
13598 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13599 assert(writer->kind < kind);
13600
13601 switch (kind)
13602 {
13603 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13604 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13605 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13606 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013607 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013608 }
13609
13610 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13611}
13612
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013613static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013614_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013615{
Victor Stinner2740e462016-09-06 16:58:36 -070013616 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013617 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13618 return -1;
13619 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13620 writer->pos++;
13621 return 0;
13622}
13623
13624int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013625_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13626{
13627 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13628}
13629
13630int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013631_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13632{
13633 Py_UCS4 maxchar;
13634 Py_ssize_t len;
13635
13636 if (PyUnicode_READY(str) == -1)
13637 return -1;
13638 len = PyUnicode_GET_LENGTH(str);
13639 if (len == 0)
13640 return 0;
13641 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13642 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013643 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013644 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013645 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013646 Py_INCREF(str);
13647 writer->buffer = str;
13648 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013649 writer->pos += len;
13650 return 0;
13651 }
13652 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13653 return -1;
13654 }
13655 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13656 str, 0, len);
13657 writer->pos += len;
13658 return 0;
13659}
13660
Victor Stinnere215d962012-10-06 23:03:36 +020013661int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013662_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13663 Py_ssize_t start, Py_ssize_t end)
13664{
13665 Py_UCS4 maxchar;
13666 Py_ssize_t len;
13667
13668 if (PyUnicode_READY(str) == -1)
13669 return -1;
13670
13671 assert(0 <= start);
13672 assert(end <= PyUnicode_GET_LENGTH(str));
13673 assert(start <= end);
13674
13675 if (end == 0)
13676 return 0;
13677
13678 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13679 return _PyUnicodeWriter_WriteStr(writer, str);
13680
13681 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13682 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13683 else
13684 maxchar = writer->maxchar;
13685 len = end - start;
13686
13687 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13688 return -1;
13689
13690 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13691 str, start, len);
13692 writer->pos += len;
13693 return 0;
13694}
13695
13696int
Victor Stinner4a587072013-11-19 12:54:53 +010013697_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13698 const char *ascii, Py_ssize_t len)
13699{
13700 if (len == -1)
13701 len = strlen(ascii);
13702
13703 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13704
13705 if (writer->buffer == NULL && !writer->overallocate) {
13706 PyObject *str;
13707
13708 str = _PyUnicode_FromASCII(ascii, len);
13709 if (str == NULL)
13710 return -1;
13711
13712 writer->readonly = 1;
13713 writer->buffer = str;
13714 _PyUnicodeWriter_Update(writer);
13715 writer->pos += len;
13716 return 0;
13717 }
13718
13719 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13720 return -1;
13721
13722 switch (writer->kind)
13723 {
13724 case PyUnicode_1BYTE_KIND:
13725 {
13726 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13727 Py_UCS1 *data = writer->data;
13728
Christian Heimesf051e432016-09-13 20:22:02 +020013729 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013730 break;
13731 }
13732 case PyUnicode_2BYTE_KIND:
13733 {
13734 _PyUnicode_CONVERT_BYTES(
13735 Py_UCS1, Py_UCS2,
13736 ascii, ascii + len,
13737 (Py_UCS2 *)writer->data + writer->pos);
13738 break;
13739 }
13740 case PyUnicode_4BYTE_KIND:
13741 {
13742 _PyUnicode_CONVERT_BYTES(
13743 Py_UCS1, Py_UCS4,
13744 ascii, ascii + len,
13745 (Py_UCS4 *)writer->data + writer->pos);
13746 break;
13747 }
13748 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013749 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013750 }
13751
13752 writer->pos += len;
13753 return 0;
13754}
13755
13756int
13757_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13758 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013759{
13760 Py_UCS4 maxchar;
13761
13762 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13763 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13764 return -1;
13765 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13766 writer->pos += len;
13767 return 0;
13768}
13769
Victor Stinnerd3f08822012-05-29 12:57:52 +020013770PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013771_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013772{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013773 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013774
Victor Stinnerd3f08822012-05-29 12:57:52 +020013775 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013776 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013777 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013778 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013779
13780 str = writer->buffer;
13781 writer->buffer = NULL;
13782
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013783 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013784 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13785 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013786 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013787
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013788 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13789 PyObject *str2;
13790 str2 = resize_compact(str, writer->pos);
13791 if (str2 == NULL) {
13792 Py_DECREF(str);
13793 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013794 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013795 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013796 }
13797
Victor Stinner15a0bd32013-07-08 22:29:55 +020013798 assert(_PyUnicode_CheckConsistency(str, 1));
13799 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013800}
13801
Victor Stinnerd3f08822012-05-29 12:57:52 +020013802void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013803_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013804{
13805 Py_CLEAR(writer->buffer);
13806}
13807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013808#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013809
13810PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013811 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013812\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013813Return a formatted version of S, using substitutions from args and kwargs.\n\
13814The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013815
Eric Smith27bbca62010-11-04 17:06:58 +000013816PyDoc_STRVAR(format_map__doc__,
13817 "S.format_map(mapping) -> str\n\
13818\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013819Return a formatted version of S, using substitutions from mapping.\n\
13820The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013821
INADA Naoki3ae20562017-01-16 20:41:20 +090013822/*[clinic input]
13823str.__format__ as unicode___format__
13824
13825 format_spec: unicode
13826 /
13827
13828Return a formatted version of the string as described by format_spec.
13829[clinic start generated code]*/
13830
Eric Smith4a7d76d2008-05-30 18:10:19 +000013831static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013832unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013833/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013834{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013835 _PyUnicodeWriter writer;
13836 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013837
Victor Stinnerd3f08822012-05-29 12:57:52 +020013838 if (PyUnicode_READY(self) == -1)
13839 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013840 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013841 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13842 self, format_spec, 0,
13843 PyUnicode_GET_LENGTH(format_spec));
13844 if (ret == -1) {
13845 _PyUnicodeWriter_Dealloc(&writer);
13846 return NULL;
13847 }
13848 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013849}
13850
INADA Naoki3ae20562017-01-16 20:41:20 +090013851/*[clinic input]
13852str.__sizeof__ as unicode_sizeof
13853
13854Return the size of the string in memory, in bytes.
13855[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013856
13857static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013858unicode_sizeof_impl(PyObject *self)
13859/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013860{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013861 Py_ssize_t size;
13862
13863 /* If it's a compact object, account for base structure +
13864 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013865 if (PyUnicode_IS_COMPACT_ASCII(self))
13866 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13867 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013868 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013869 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013870 else {
13871 /* If it is a two-block object, account for base object, and
13872 for character block if present. */
13873 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013874 if (_PyUnicode_DATA_ANY(self))
13875 size += (PyUnicode_GET_LENGTH(self) + 1) *
13876 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013877 }
13878 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013879 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013880 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13881 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13882 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13883 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013884
13885 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013886}
13887
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013888static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013889unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013890{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013891 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013892 if (!copy)
13893 return NULL;
13894 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013895}
13896
Guido van Rossumd57fd912000-03-10 22:53:23 +000013897static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013898 UNICODE_ENCODE_METHODDEF
13899 UNICODE_REPLACE_METHODDEF
13900 UNICODE_SPLIT_METHODDEF
13901 UNICODE_RSPLIT_METHODDEF
13902 UNICODE_JOIN_METHODDEF
13903 UNICODE_CAPITALIZE_METHODDEF
13904 UNICODE_CASEFOLD_METHODDEF
13905 UNICODE_TITLE_METHODDEF
13906 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013907 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013908 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013909 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013910 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013911 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013912 UNICODE_LJUST_METHODDEF
13913 UNICODE_LOWER_METHODDEF
13914 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013915 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13916 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013917 UNICODE_RJUST_METHODDEF
13918 UNICODE_RSTRIP_METHODDEF
13919 UNICODE_RPARTITION_METHODDEF
13920 UNICODE_SPLITLINES_METHODDEF
13921 UNICODE_STRIP_METHODDEF
13922 UNICODE_SWAPCASE_METHODDEF
13923 UNICODE_TRANSLATE_METHODDEF
13924 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013925 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13926 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090013927 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013928 UNICODE_ISLOWER_METHODDEF
13929 UNICODE_ISUPPER_METHODDEF
13930 UNICODE_ISTITLE_METHODDEF
13931 UNICODE_ISSPACE_METHODDEF
13932 UNICODE_ISDECIMAL_METHODDEF
13933 UNICODE_ISDIGIT_METHODDEF
13934 UNICODE_ISNUMERIC_METHODDEF
13935 UNICODE_ISALPHA_METHODDEF
13936 UNICODE_ISALNUM_METHODDEF
13937 UNICODE_ISIDENTIFIER_METHODDEF
13938 UNICODE_ISPRINTABLE_METHODDEF
13939 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020013940 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013941 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013942 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013943 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013944 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013945#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013946 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013947 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013948#endif
13949
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013950 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013951 {NULL, NULL}
13952};
13953
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013954static PyObject *
13955unicode_mod(PyObject *v, PyObject *w)
13956{
Brian Curtindfc80e32011-08-10 20:28:54 -050013957 if (!PyUnicode_Check(v))
13958 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013959 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013960}
13961
13962static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013963 0, /*nb_add*/
13964 0, /*nb_subtract*/
13965 0, /*nb_multiply*/
13966 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013967};
13968
Guido van Rossumd57fd912000-03-10 22:53:23 +000013969static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013970 (lenfunc) unicode_length, /* sq_length */
13971 PyUnicode_Concat, /* sq_concat */
13972 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13973 (ssizeargfunc) unicode_getitem, /* sq_item */
13974 0, /* sq_slice */
13975 0, /* sq_ass_item */
13976 0, /* sq_ass_slice */
13977 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013978};
13979
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013980static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013981unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013982{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013983 if (PyUnicode_READY(self) == -1)
13984 return NULL;
13985
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013986 if (PyIndex_Check(item)) {
13987 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013988 if (i == -1 && PyErr_Occurred())
13989 return NULL;
13990 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013991 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013992 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013993 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013994 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013995 PyObject *result;
13996 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013997 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013998 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013999
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014000 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014001 return NULL;
14002 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014003 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14004 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014005
14006 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014007 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014008 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014009 slicelength == PyUnicode_GET_LENGTH(self)) {
14010 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014011 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014012 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014013 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014014 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014015 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014016 src_kind = PyUnicode_KIND(self);
14017 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014018 if (!PyUnicode_IS_ASCII(self)) {
14019 kind_limit = kind_maxchar_limit(src_kind);
14020 max_char = 0;
14021 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14022 ch = PyUnicode_READ(src_kind, src_data, cur);
14023 if (ch > max_char) {
14024 max_char = ch;
14025 if (max_char >= kind_limit)
14026 break;
14027 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014028 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014029 }
Victor Stinner55c99112011-10-13 01:17:06 +020014030 else
14031 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014032 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014033 if (result == NULL)
14034 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014035 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014036 dest_data = PyUnicode_DATA(result);
14037
14038 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014039 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14040 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014041 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014042 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014043 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014044 } else {
14045 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14046 return NULL;
14047 }
14048}
14049
14050static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014051 (lenfunc)unicode_length, /* mp_length */
14052 (binaryfunc)unicode_subscript, /* mp_subscript */
14053 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014054};
14055
Guido van Rossumd57fd912000-03-10 22:53:23 +000014056
Guido van Rossumd57fd912000-03-10 22:53:23 +000014057/* Helpers for PyUnicode_Format() */
14058
Victor Stinnera47082312012-10-04 02:19:54 +020014059struct unicode_formatter_t {
14060 PyObject *args;
14061 int args_owned;
14062 Py_ssize_t arglen, argidx;
14063 PyObject *dict;
14064
14065 enum PyUnicode_Kind fmtkind;
14066 Py_ssize_t fmtcnt, fmtpos;
14067 void *fmtdata;
14068 PyObject *fmtstr;
14069
14070 _PyUnicodeWriter writer;
14071};
14072
14073struct unicode_format_arg_t {
14074 Py_UCS4 ch;
14075 int flags;
14076 Py_ssize_t width;
14077 int prec;
14078 int sign;
14079};
14080
Guido van Rossumd57fd912000-03-10 22:53:23 +000014081static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014082unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014083{
Victor Stinnera47082312012-10-04 02:19:54 +020014084 Py_ssize_t argidx = ctx->argidx;
14085
14086 if (argidx < ctx->arglen) {
14087 ctx->argidx++;
14088 if (ctx->arglen < 0)
14089 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014090 else
Victor Stinnera47082312012-10-04 02:19:54 +020014091 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014092 }
14093 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014094 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014095 return NULL;
14096}
14097
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014098/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014099
Victor Stinnera47082312012-10-04 02:19:54 +020014100/* Format a float into the writer if the writer is not NULL, or into *p_output
14101 otherwise.
14102
14103 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014104static int
Victor Stinnera47082312012-10-04 02:19:54 +020014105formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14106 PyObject **p_output,
14107 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014108{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014109 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014110 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014111 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014112 int prec;
14113 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014114
Guido van Rossumd57fd912000-03-10 22:53:23 +000014115 x = PyFloat_AsDouble(v);
14116 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014117 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014118
Victor Stinnera47082312012-10-04 02:19:54 +020014119 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014120 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014121 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014122
Victor Stinnera47082312012-10-04 02:19:54 +020014123 if (arg->flags & F_ALT)
14124 dtoa_flags = Py_DTSF_ALT;
14125 else
14126 dtoa_flags = 0;
14127 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014128 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014129 return -1;
14130 len = strlen(p);
14131 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014132 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014133 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014134 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014135 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014136 }
14137 else
14138 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014139 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014140 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014141}
14142
Victor Stinnerd0880d52012-04-27 23:40:13 +020014143/* formatlong() emulates the format codes d, u, o, x and X, and
14144 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14145 * Python's regular ints.
14146 * Return value: a new PyUnicodeObject*, or NULL if error.
14147 * The output string is of the form
14148 * "-"? ("0x" | "0X")? digit+
14149 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14150 * set in flags. The case of hex digits will be correct,
14151 * There will be at least prec digits, zero-filled on the left if
14152 * necessary to get that many.
14153 * val object to be converted
14154 * flags bitmask of format flags; only F_ALT is looked at
14155 * prec minimum number of digits; 0-fill on left if needed
14156 * type a character in [duoxX]; u acts the same as d
14157 *
14158 * CAUTION: o, x and X conversions on regular ints can never
14159 * produce a '-' sign, but can for Python's unbounded ints.
14160 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014161PyObject *
14162_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014163{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014164 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014165 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014166 Py_ssize_t i;
14167 int sign; /* 1 if '-', else 0 */
14168 int len; /* number of characters */
14169 Py_ssize_t llen;
14170 int numdigits; /* len == numnondigits + numdigits */
14171 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014172
Victor Stinnerd0880d52012-04-27 23:40:13 +020014173 /* Avoid exceeding SSIZE_T_MAX */
14174 if (prec > INT_MAX-3) {
14175 PyErr_SetString(PyExc_OverflowError,
14176 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014177 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014178 }
14179
14180 assert(PyLong_Check(val));
14181
14182 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014183 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014184 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014185 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014186 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014187 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014188 /* int and int subclasses should print numerically when a numeric */
14189 /* format code is used (see issue18780) */
14190 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014191 break;
14192 case 'o':
14193 numnondigits = 2;
14194 result = PyNumber_ToBase(val, 8);
14195 break;
14196 case 'x':
14197 case 'X':
14198 numnondigits = 2;
14199 result = PyNumber_ToBase(val, 16);
14200 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014201 }
14202 if (!result)
14203 return NULL;
14204
14205 assert(unicode_modifiable(result));
14206 assert(PyUnicode_IS_READY(result));
14207 assert(PyUnicode_IS_ASCII(result));
14208
14209 /* To modify the string in-place, there can only be one reference. */
14210 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014211 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014212 PyErr_BadInternalCall();
14213 return NULL;
14214 }
14215 buf = PyUnicode_DATA(result);
14216 llen = PyUnicode_GET_LENGTH(result);
14217 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014218 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014219 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014220 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014221 return NULL;
14222 }
14223 len = (int)llen;
14224 sign = buf[0] == '-';
14225 numnondigits += sign;
14226 numdigits = len - numnondigits;
14227 assert(numdigits > 0);
14228
14229 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014230 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014231 (type == 'o' || type == 'x' || type == 'X'))) {
14232 assert(buf[sign] == '0');
14233 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14234 buf[sign+1] == 'o');
14235 numnondigits -= 2;
14236 buf += 2;
14237 len -= 2;
14238 if (sign)
14239 buf[0] = '-';
14240 assert(len == numnondigits + numdigits);
14241 assert(numdigits > 0);
14242 }
14243
14244 /* Fill with leading zeroes to meet minimum width. */
14245 if (prec > numdigits) {
14246 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14247 numnondigits + prec);
14248 char *b1;
14249 if (!r1) {
14250 Py_DECREF(result);
14251 return NULL;
14252 }
14253 b1 = PyBytes_AS_STRING(r1);
14254 for (i = 0; i < numnondigits; ++i)
14255 *b1++ = *buf++;
14256 for (i = 0; i < prec - numdigits; i++)
14257 *b1++ = '0';
14258 for (i = 0; i < numdigits; i++)
14259 *b1++ = *buf++;
14260 *b1 = '\0';
14261 Py_DECREF(result);
14262 result = r1;
14263 buf = PyBytes_AS_STRING(result);
14264 len = numnondigits + prec;
14265 }
14266
14267 /* Fix up case for hex conversions. */
14268 if (type == 'X') {
14269 /* Need to convert all lower case letters to upper case.
14270 and need to convert 0x to 0X (and -0x to -0X). */
14271 for (i = 0; i < len; i++)
14272 if (buf[i] >= 'a' && buf[i] <= 'x')
14273 buf[i] -= 'a'-'A';
14274 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014275 if (!PyUnicode_Check(result)
14276 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014277 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014278 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014279 Py_DECREF(result);
14280 result = unicode;
14281 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014282 else if (len != PyUnicode_GET_LENGTH(result)) {
14283 if (PyUnicode_Resize(&result, len) < 0)
14284 Py_CLEAR(result);
14285 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014286 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014287}
14288
Ethan Furmandf3ed242014-01-05 06:50:30 -080014289/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014290 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014291 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014292 * -1 and raise an exception on error */
14293static int
Victor Stinnera47082312012-10-04 02:19:54 +020014294mainformatlong(PyObject *v,
14295 struct unicode_format_arg_t *arg,
14296 PyObject **p_output,
14297 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014298{
14299 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014300 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014301
14302 if (!PyNumber_Check(v))
14303 goto wrongtype;
14304
Ethan Furman9ab74802014-03-21 06:38:46 -070014305 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014306 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014307 if (type == 'o' || type == 'x' || type == 'X') {
14308 iobj = PyNumber_Index(v);
14309 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014310 if (PyErr_ExceptionMatches(PyExc_TypeError))
14311 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014312 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014313 }
14314 }
14315 else {
14316 iobj = PyNumber_Long(v);
14317 if (iobj == NULL ) {
14318 if (PyErr_ExceptionMatches(PyExc_TypeError))
14319 goto wrongtype;
14320 return -1;
14321 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014322 }
14323 assert(PyLong_Check(iobj));
14324 }
14325 else {
14326 iobj = v;
14327 Py_INCREF(iobj);
14328 }
14329
14330 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014331 && arg->width == -1 && arg->prec == -1
14332 && !(arg->flags & (F_SIGN | F_BLANK))
14333 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014334 {
14335 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014336 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014337 int base;
14338
Victor Stinnera47082312012-10-04 02:19:54 +020014339 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014340 {
14341 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014342 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014343 case 'd':
14344 case 'i':
14345 case 'u':
14346 base = 10;
14347 break;
14348 case 'o':
14349 base = 8;
14350 break;
14351 case 'x':
14352 case 'X':
14353 base = 16;
14354 break;
14355 }
14356
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014357 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14358 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014359 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014360 }
14361 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014362 return 1;
14363 }
14364
Ethan Furmanb95b5612015-01-23 20:05:18 -080014365 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014366 Py_DECREF(iobj);
14367 if (res == NULL)
14368 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014369 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014370 return 0;
14371
14372wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014373 switch(type)
14374 {
14375 case 'o':
14376 case 'x':
14377 case 'X':
14378 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014379 "%%%c format: an integer is required, "
14380 "not %.200s",
14381 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014382 break;
14383 default:
14384 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014385 "%%%c format: a number is required, "
14386 "not %.200s",
14387 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014388 break;
14389 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014390 return -1;
14391}
14392
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014393static Py_UCS4
14394formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014395{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014396 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014397 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014398 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014399 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014400 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014401 goto onError;
14402 }
14403 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014404 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014405 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014406 /* make sure number is a type of integer */
14407 if (!PyLong_Check(v)) {
14408 iobj = PyNumber_Index(v);
14409 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014410 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014411 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014412 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014413 Py_DECREF(iobj);
14414 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014415 else {
14416 x = PyLong_AsLong(v);
14417 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014418 if (x == -1 && PyErr_Occurred())
14419 goto onError;
14420
Victor Stinner8faf8212011-12-08 22:14:11 +010014421 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014422 PyErr_SetString(PyExc_OverflowError,
14423 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014424 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014425 }
14426
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014427 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014428 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014429
Benjamin Peterson29060642009-01-31 22:14:21 +000014430 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014431 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014432 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014433 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014434}
14435
Victor Stinnera47082312012-10-04 02:19:54 +020014436/* Parse options of an argument: flags, width, precision.
14437 Handle also "%(name)" syntax.
14438
14439 Return 0 if the argument has been formatted into arg->str.
14440 Return 1 if the argument has been written into ctx->writer,
14441 Raise an exception and return -1 on error. */
14442static int
14443unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14444 struct unicode_format_arg_t *arg)
14445{
14446#define FORMAT_READ(ctx) \
14447 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14448
14449 PyObject *v;
14450
Victor Stinnera47082312012-10-04 02:19:54 +020014451 if (arg->ch == '(') {
14452 /* Get argument value from a dictionary. Example: "%(name)s". */
14453 Py_ssize_t keystart;
14454 Py_ssize_t keylen;
14455 PyObject *key;
14456 int pcount = 1;
14457
14458 if (ctx->dict == NULL) {
14459 PyErr_SetString(PyExc_TypeError,
14460 "format requires a mapping");
14461 return -1;
14462 }
14463 ++ctx->fmtpos;
14464 --ctx->fmtcnt;
14465 keystart = ctx->fmtpos;
14466 /* Skip over balanced parentheses */
14467 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14468 arg->ch = FORMAT_READ(ctx);
14469 if (arg->ch == ')')
14470 --pcount;
14471 else if (arg->ch == '(')
14472 ++pcount;
14473 ctx->fmtpos++;
14474 }
14475 keylen = ctx->fmtpos - keystart - 1;
14476 if (ctx->fmtcnt < 0 || pcount > 0) {
14477 PyErr_SetString(PyExc_ValueError,
14478 "incomplete format key");
14479 return -1;
14480 }
14481 key = PyUnicode_Substring(ctx->fmtstr,
14482 keystart, keystart + keylen);
14483 if (key == NULL)
14484 return -1;
14485 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014486 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014487 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014488 }
14489 ctx->args = PyObject_GetItem(ctx->dict, key);
14490 Py_DECREF(key);
14491 if (ctx->args == NULL)
14492 return -1;
14493 ctx->args_owned = 1;
14494 ctx->arglen = -1;
14495 ctx->argidx = -2;
14496 }
14497
14498 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014499 while (--ctx->fmtcnt >= 0) {
14500 arg->ch = FORMAT_READ(ctx);
14501 ctx->fmtpos++;
14502 switch (arg->ch) {
14503 case '-': arg->flags |= F_LJUST; continue;
14504 case '+': arg->flags |= F_SIGN; continue;
14505 case ' ': arg->flags |= F_BLANK; continue;
14506 case '#': arg->flags |= F_ALT; continue;
14507 case '0': arg->flags |= F_ZERO; continue;
14508 }
14509 break;
14510 }
14511
14512 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014513 if (arg->ch == '*') {
14514 v = unicode_format_getnextarg(ctx);
14515 if (v == NULL)
14516 return -1;
14517 if (!PyLong_Check(v)) {
14518 PyErr_SetString(PyExc_TypeError,
14519 "* wants int");
14520 return -1;
14521 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014522 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014523 if (arg->width == -1 && PyErr_Occurred())
14524 return -1;
14525 if (arg->width < 0) {
14526 arg->flags |= F_LJUST;
14527 arg->width = -arg->width;
14528 }
14529 if (--ctx->fmtcnt >= 0) {
14530 arg->ch = FORMAT_READ(ctx);
14531 ctx->fmtpos++;
14532 }
14533 }
14534 else if (arg->ch >= '0' && arg->ch <= '9') {
14535 arg->width = arg->ch - '0';
14536 while (--ctx->fmtcnt >= 0) {
14537 arg->ch = FORMAT_READ(ctx);
14538 ctx->fmtpos++;
14539 if (arg->ch < '0' || arg->ch > '9')
14540 break;
14541 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14542 mixing signed and unsigned comparison. Since arg->ch is between
14543 '0' and '9', casting to int is safe. */
14544 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14545 PyErr_SetString(PyExc_ValueError,
14546 "width too big");
14547 return -1;
14548 }
14549 arg->width = arg->width*10 + (arg->ch - '0');
14550 }
14551 }
14552
14553 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014554 if (arg->ch == '.') {
14555 arg->prec = 0;
14556 if (--ctx->fmtcnt >= 0) {
14557 arg->ch = FORMAT_READ(ctx);
14558 ctx->fmtpos++;
14559 }
14560 if (arg->ch == '*') {
14561 v = unicode_format_getnextarg(ctx);
14562 if (v == NULL)
14563 return -1;
14564 if (!PyLong_Check(v)) {
14565 PyErr_SetString(PyExc_TypeError,
14566 "* wants int");
14567 return -1;
14568 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014569 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014570 if (arg->prec == -1 && PyErr_Occurred())
14571 return -1;
14572 if (arg->prec < 0)
14573 arg->prec = 0;
14574 if (--ctx->fmtcnt >= 0) {
14575 arg->ch = FORMAT_READ(ctx);
14576 ctx->fmtpos++;
14577 }
14578 }
14579 else if (arg->ch >= '0' && arg->ch <= '9') {
14580 arg->prec = arg->ch - '0';
14581 while (--ctx->fmtcnt >= 0) {
14582 arg->ch = FORMAT_READ(ctx);
14583 ctx->fmtpos++;
14584 if (arg->ch < '0' || arg->ch > '9')
14585 break;
14586 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14587 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014588 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014589 return -1;
14590 }
14591 arg->prec = arg->prec*10 + (arg->ch - '0');
14592 }
14593 }
14594 }
14595
14596 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14597 if (ctx->fmtcnt >= 0) {
14598 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14599 if (--ctx->fmtcnt >= 0) {
14600 arg->ch = FORMAT_READ(ctx);
14601 ctx->fmtpos++;
14602 }
14603 }
14604 }
14605 if (ctx->fmtcnt < 0) {
14606 PyErr_SetString(PyExc_ValueError,
14607 "incomplete format");
14608 return -1;
14609 }
14610 return 0;
14611
14612#undef FORMAT_READ
14613}
14614
14615/* Format one argument. Supported conversion specifiers:
14616
14617 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014618 - "i", "d", "u": int or float
14619 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014620 - "e", "E", "f", "F", "g", "G": float
14621 - "c": int or str (1 character)
14622
Victor Stinner8dbd4212012-12-04 09:30:24 +010014623 When possible, the output is written directly into the Unicode writer
14624 (ctx->writer). A string is created when padding is required.
14625
Victor Stinnera47082312012-10-04 02:19:54 +020014626 Return 0 if the argument has been formatted into *p_str,
14627 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014628 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014629static int
14630unicode_format_arg_format(struct unicode_formatter_t *ctx,
14631 struct unicode_format_arg_t *arg,
14632 PyObject **p_str)
14633{
14634 PyObject *v;
14635 _PyUnicodeWriter *writer = &ctx->writer;
14636
14637 if (ctx->fmtcnt == 0)
14638 ctx->writer.overallocate = 0;
14639
Victor Stinnera47082312012-10-04 02:19:54 +020014640 v = unicode_format_getnextarg(ctx);
14641 if (v == NULL)
14642 return -1;
14643
Victor Stinnera47082312012-10-04 02:19:54 +020014644
14645 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014646 case 's':
14647 case 'r':
14648 case 'a':
14649 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14650 /* Fast path */
14651 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14652 return -1;
14653 return 1;
14654 }
14655
14656 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14657 *p_str = v;
14658 Py_INCREF(*p_str);
14659 }
14660 else {
14661 if (arg->ch == 's')
14662 *p_str = PyObject_Str(v);
14663 else if (arg->ch == 'r')
14664 *p_str = PyObject_Repr(v);
14665 else
14666 *p_str = PyObject_ASCII(v);
14667 }
14668 break;
14669
14670 case 'i':
14671 case 'd':
14672 case 'u':
14673 case 'o':
14674 case 'x':
14675 case 'X':
14676 {
14677 int ret = mainformatlong(v, arg, p_str, writer);
14678 if (ret != 0)
14679 return ret;
14680 arg->sign = 1;
14681 break;
14682 }
14683
14684 case 'e':
14685 case 'E':
14686 case 'f':
14687 case 'F':
14688 case 'g':
14689 case 'G':
14690 if (arg->width == -1 && arg->prec == -1
14691 && !(arg->flags & (F_SIGN | F_BLANK)))
14692 {
14693 /* Fast path */
14694 if (formatfloat(v, arg, NULL, writer) == -1)
14695 return -1;
14696 return 1;
14697 }
14698
14699 arg->sign = 1;
14700 if (formatfloat(v, arg, p_str, NULL) == -1)
14701 return -1;
14702 break;
14703
14704 case 'c':
14705 {
14706 Py_UCS4 ch = formatchar(v);
14707 if (ch == (Py_UCS4) -1)
14708 return -1;
14709 if (arg->width == -1 && arg->prec == -1) {
14710 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014711 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014712 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014713 return 1;
14714 }
14715 *p_str = PyUnicode_FromOrdinal(ch);
14716 break;
14717 }
14718
14719 default:
14720 PyErr_Format(PyExc_ValueError,
14721 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014722 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014723 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14724 (int)arg->ch,
14725 ctx->fmtpos - 1);
14726 return -1;
14727 }
14728 if (*p_str == NULL)
14729 return -1;
14730 assert (PyUnicode_Check(*p_str));
14731 return 0;
14732}
14733
14734static int
14735unicode_format_arg_output(struct unicode_formatter_t *ctx,
14736 struct unicode_format_arg_t *arg,
14737 PyObject *str)
14738{
14739 Py_ssize_t len;
14740 enum PyUnicode_Kind kind;
14741 void *pbuf;
14742 Py_ssize_t pindex;
14743 Py_UCS4 signchar;
14744 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014745 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014746 Py_ssize_t sublen;
14747 _PyUnicodeWriter *writer = &ctx->writer;
14748 Py_UCS4 fill;
14749
14750 fill = ' ';
14751 if (arg->sign && arg->flags & F_ZERO)
14752 fill = '0';
14753
14754 if (PyUnicode_READY(str) == -1)
14755 return -1;
14756
14757 len = PyUnicode_GET_LENGTH(str);
14758 if ((arg->width == -1 || arg->width <= len)
14759 && (arg->prec == -1 || arg->prec >= len)
14760 && !(arg->flags & (F_SIGN | F_BLANK)))
14761 {
14762 /* Fast path */
14763 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14764 return -1;
14765 return 0;
14766 }
14767
14768 /* Truncate the string for "s", "r" and "a" formats
14769 if the precision is set */
14770 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14771 if (arg->prec >= 0 && len > arg->prec)
14772 len = arg->prec;
14773 }
14774
14775 /* Adjust sign and width */
14776 kind = PyUnicode_KIND(str);
14777 pbuf = PyUnicode_DATA(str);
14778 pindex = 0;
14779 signchar = '\0';
14780 if (arg->sign) {
14781 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14782 if (ch == '-' || ch == '+') {
14783 signchar = ch;
14784 len--;
14785 pindex++;
14786 }
14787 else if (arg->flags & F_SIGN)
14788 signchar = '+';
14789 else if (arg->flags & F_BLANK)
14790 signchar = ' ';
14791 else
14792 arg->sign = 0;
14793 }
14794 if (arg->width < len)
14795 arg->width = len;
14796
14797 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014798 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014799 if (!(arg->flags & F_LJUST)) {
14800 if (arg->sign) {
14801 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014802 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014803 }
14804 else {
14805 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014806 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014807 }
14808 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014809 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14810 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014811 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014812 }
14813
Victor Stinnera47082312012-10-04 02:19:54 +020014814 buflen = arg->width;
14815 if (arg->sign && len == arg->width)
14816 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014817 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014818 return -1;
14819
14820 /* Write the sign if needed */
14821 if (arg->sign) {
14822 if (fill != ' ') {
14823 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14824 writer->pos += 1;
14825 }
14826 if (arg->width > len)
14827 arg->width--;
14828 }
14829
14830 /* Write the numeric prefix for "x", "X" and "o" formats
14831 if the alternate form is used.
14832 For example, write "0x" for the "%#x" format. */
14833 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14834 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14835 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14836 if (fill != ' ') {
14837 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14838 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14839 writer->pos += 2;
14840 pindex += 2;
14841 }
14842 arg->width -= 2;
14843 if (arg->width < 0)
14844 arg->width = 0;
14845 len -= 2;
14846 }
14847
14848 /* Pad left with the fill character if needed */
14849 if (arg->width > len && !(arg->flags & F_LJUST)) {
14850 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014851 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014852 writer->pos += sublen;
14853 arg->width = len;
14854 }
14855
14856 /* If padding with spaces: write sign if needed and/or numeric prefix if
14857 the alternate form is used */
14858 if (fill == ' ') {
14859 if (arg->sign) {
14860 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14861 writer->pos += 1;
14862 }
14863 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14864 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14865 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14866 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14867 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14868 writer->pos += 2;
14869 pindex += 2;
14870 }
14871 }
14872
14873 /* Write characters */
14874 if (len) {
14875 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14876 str, pindex, len);
14877 writer->pos += len;
14878 }
14879
14880 /* Pad right with the fill character if needed */
14881 if (arg->width > len) {
14882 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014883 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014884 writer->pos += sublen;
14885 }
14886 return 0;
14887}
14888
14889/* Helper of PyUnicode_Format(): format one arg.
14890 Return 0 on success, raise an exception and return -1 on error. */
14891static int
14892unicode_format_arg(struct unicode_formatter_t *ctx)
14893{
14894 struct unicode_format_arg_t arg;
14895 PyObject *str;
14896 int ret;
14897
Victor Stinner8dbd4212012-12-04 09:30:24 +010014898 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014899 if (arg.ch == '%') {
14900 ctx->fmtpos++;
14901 ctx->fmtcnt--;
14902 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14903 return -1;
14904 return 0;
14905 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014906 arg.flags = 0;
14907 arg.width = -1;
14908 arg.prec = -1;
14909 arg.sign = 0;
14910 str = NULL;
14911
Victor Stinnera47082312012-10-04 02:19:54 +020014912 ret = unicode_format_arg_parse(ctx, &arg);
14913 if (ret == -1)
14914 return -1;
14915
14916 ret = unicode_format_arg_format(ctx, &arg, &str);
14917 if (ret == -1)
14918 return -1;
14919
14920 if (ret != 1) {
14921 ret = unicode_format_arg_output(ctx, &arg, str);
14922 Py_DECREF(str);
14923 if (ret == -1)
14924 return -1;
14925 }
14926
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014927 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014928 PyErr_SetString(PyExc_TypeError,
14929 "not all arguments converted during string formatting");
14930 return -1;
14931 }
14932 return 0;
14933}
14934
Alexander Belopolsky40018472011-02-26 01:02:56 +000014935PyObject *
14936PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014937{
Victor Stinnera47082312012-10-04 02:19:54 +020014938 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014939
Guido van Rossumd57fd912000-03-10 22:53:23 +000014940 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014941 PyErr_BadInternalCall();
14942 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014943 }
Victor Stinnera47082312012-10-04 02:19:54 +020014944
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014945 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014946 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014947
14948 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014949 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14950 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14951 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14952 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014953
Victor Stinner8f674cc2013-04-17 23:02:17 +020014954 _PyUnicodeWriter_Init(&ctx.writer);
14955 ctx.writer.min_length = ctx.fmtcnt + 100;
14956 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014957
Guido van Rossumd57fd912000-03-10 22:53:23 +000014958 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014959 ctx.arglen = PyTuple_Size(args);
14960 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014961 }
14962 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014963 ctx.arglen = -1;
14964 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014965 }
Victor Stinnera47082312012-10-04 02:19:54 +020014966 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014967 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014968 ctx.dict = args;
14969 else
14970 ctx.dict = NULL;
14971 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014972
Victor Stinnera47082312012-10-04 02:19:54 +020014973 while (--ctx.fmtcnt >= 0) {
14974 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014975 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014976
14977 nonfmtpos = ctx.fmtpos++;
14978 while (ctx.fmtcnt >= 0 &&
14979 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14980 ctx.fmtpos++;
14981 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014982 }
Victor Stinnera47082312012-10-04 02:19:54 +020014983 if (ctx.fmtcnt < 0) {
14984 ctx.fmtpos--;
14985 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014986 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014987
Victor Stinnercfc4c132013-04-03 01:48:39 +020014988 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14989 nonfmtpos, ctx.fmtpos) < 0)
14990 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014991 }
14992 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014993 ctx.fmtpos++;
14994 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014995 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014996 }
14997 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014998
Victor Stinnera47082312012-10-04 02:19:54 +020014999 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015000 PyErr_SetString(PyExc_TypeError,
15001 "not all arguments converted during string formatting");
15002 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015003 }
15004
Victor Stinnera47082312012-10-04 02:19:54 +020015005 if (ctx.args_owned) {
15006 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015007 }
Victor Stinnera47082312012-10-04 02:19:54 +020015008 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015009
Benjamin Peterson29060642009-01-31 22:14:21 +000015010 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015011 _PyUnicodeWriter_Dealloc(&ctx.writer);
15012 if (ctx.args_owned) {
15013 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015014 }
15015 return NULL;
15016}
15017
Jeremy Hylton938ace62002-07-17 16:30:39 +000015018static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015019unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15020
Tim Peters6d6c1a32001-08-02 04:15:00 +000015021static PyObject *
15022unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15023{
Benjamin Peterson29060642009-01-31 22:14:21 +000015024 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015025 static char *kwlist[] = {"object", "encoding", "errors", 0};
15026 char *encoding = NULL;
15027 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015028
Benjamin Peterson14339b62009-01-31 16:36:08 +000015029 if (type != &PyUnicode_Type)
15030 return unicode_subtype_new(type, args, kwds);
15031 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015032 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015033 return NULL;
15034 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015035 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015036 if (encoding == NULL && errors == NULL)
15037 return PyObject_Str(x);
15038 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015039 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015040}
15041
Guido van Rossume023fe02001-08-30 03:12:59 +000015042static PyObject *
15043unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15044{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015045 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015046 Py_ssize_t length, char_size;
15047 int share_wstr, share_utf8;
15048 unsigned int kind;
15049 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015050
Benjamin Peterson14339b62009-01-31 16:36:08 +000015051 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015052
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015053 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015054 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015055 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015056 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015057 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015058 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015059 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015060 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015061
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015062 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015063 if (self == NULL) {
15064 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015065 return NULL;
15066 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015067 kind = PyUnicode_KIND(unicode);
15068 length = PyUnicode_GET_LENGTH(unicode);
15069
15070 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015071#ifdef Py_DEBUG
15072 _PyUnicode_HASH(self) = -1;
15073#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015074 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015075#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015076 _PyUnicode_STATE(self).interned = 0;
15077 _PyUnicode_STATE(self).kind = kind;
15078 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015079 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015080 _PyUnicode_STATE(self).ready = 1;
15081 _PyUnicode_WSTR(self) = NULL;
15082 _PyUnicode_UTF8_LENGTH(self) = 0;
15083 _PyUnicode_UTF8(self) = NULL;
15084 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015085 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015086
15087 share_utf8 = 0;
15088 share_wstr = 0;
15089 if (kind == PyUnicode_1BYTE_KIND) {
15090 char_size = 1;
15091 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15092 share_utf8 = 1;
15093 }
15094 else if (kind == PyUnicode_2BYTE_KIND) {
15095 char_size = 2;
15096 if (sizeof(wchar_t) == 2)
15097 share_wstr = 1;
15098 }
15099 else {
15100 assert(kind == PyUnicode_4BYTE_KIND);
15101 char_size = 4;
15102 if (sizeof(wchar_t) == 4)
15103 share_wstr = 1;
15104 }
15105
15106 /* Ensure we won't overflow the length. */
15107 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15108 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015109 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015110 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015111 data = PyObject_MALLOC((length + 1) * char_size);
15112 if (data == NULL) {
15113 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015114 goto onError;
15115 }
15116
Victor Stinnerc3c74152011-10-02 20:39:55 +020015117 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015118 if (share_utf8) {
15119 _PyUnicode_UTF8_LENGTH(self) = length;
15120 _PyUnicode_UTF8(self) = data;
15121 }
15122 if (share_wstr) {
15123 _PyUnicode_WSTR_LENGTH(self) = length;
15124 _PyUnicode_WSTR(self) = (wchar_t *)data;
15125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015126
Christian Heimesf051e432016-09-13 20:22:02 +020015127 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015128 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015129 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015130#ifdef Py_DEBUG
15131 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15132#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015133 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015134 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015135
15136onError:
15137 Py_DECREF(unicode);
15138 Py_DECREF(self);
15139 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015140}
15141
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015142PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015143"str(object='') -> str\n\
15144str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015145\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015146Create a new string object from the given object. If encoding or\n\
15147errors is specified, then the object must expose a data buffer\n\
15148that will be decoded using the given encoding and error handler.\n\
15149Otherwise, returns the result of object.__str__() (if defined)\n\
15150or repr(object).\n\
15151encoding defaults to sys.getdefaultencoding().\n\
15152errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015153
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015154static PyObject *unicode_iter(PyObject *seq);
15155
Guido van Rossumd57fd912000-03-10 22:53:23 +000015156PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015157 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015158 "str", /* tp_name */
15159 sizeof(PyUnicodeObject), /* tp_basicsize */
15160 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015161 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015162 (destructor)unicode_dealloc, /* tp_dealloc */
15163 0, /* tp_print */
15164 0, /* tp_getattr */
15165 0, /* tp_setattr */
15166 0, /* tp_reserved */
15167 unicode_repr, /* tp_repr */
15168 &unicode_as_number, /* tp_as_number */
15169 &unicode_as_sequence, /* tp_as_sequence */
15170 &unicode_as_mapping, /* tp_as_mapping */
15171 (hashfunc) unicode_hash, /* tp_hash*/
15172 0, /* tp_call*/
15173 (reprfunc) unicode_str, /* tp_str */
15174 PyObject_GenericGetAttr, /* tp_getattro */
15175 0, /* tp_setattro */
15176 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015177 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015178 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15179 unicode_doc, /* tp_doc */
15180 0, /* tp_traverse */
15181 0, /* tp_clear */
15182 PyUnicode_RichCompare, /* tp_richcompare */
15183 0, /* tp_weaklistoffset */
15184 unicode_iter, /* tp_iter */
15185 0, /* tp_iternext */
15186 unicode_methods, /* tp_methods */
15187 0, /* tp_members */
15188 0, /* tp_getset */
15189 &PyBaseObject_Type, /* tp_base */
15190 0, /* tp_dict */
15191 0, /* tp_descr_get */
15192 0, /* tp_descr_set */
15193 0, /* tp_dictoffset */
15194 0, /* tp_init */
15195 0, /* tp_alloc */
15196 unicode_new, /* tp_new */
15197 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015198};
15199
15200/* Initialize the Unicode implementation */
15201
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015202_PyInitError
15203_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015204{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015205 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015206 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015207 0x000A, /* LINE FEED */
15208 0x000D, /* CARRIAGE RETURN */
15209 0x001C, /* FILE SEPARATOR */
15210 0x001D, /* GROUP SEPARATOR */
15211 0x001E, /* RECORD SEPARATOR */
15212 0x0085, /* NEXT LINE */
15213 0x2028, /* LINE SEPARATOR */
15214 0x2029, /* PARAGRAPH SEPARATOR */
15215 };
15216
Fred Drakee4315f52000-05-09 19:53:39 +000015217 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015218 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015219 if (!unicode_empty) {
15220 return _Py_INIT_ERR("Can't create empty string");
15221 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015222 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015223
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015224 if (PyType_Ready(&PyUnicode_Type) < 0) {
15225 return _Py_INIT_ERR("Can't initialize unicode type");
15226 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015227
15228 /* initialize the linebreak bloom filter */
15229 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015230 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015231 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015232
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015233 if (PyType_Ready(&EncodingMapType) < 0) {
15234 return _Py_INIT_ERR("Can't initialize encoding map type");
15235 }
15236 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15237 return _Py_INIT_ERR("Can't initialize field name iterator type");
15238 }
15239 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15240 return _Py_INIT_ERR("Can't initialize formatter iter type");
15241 }
Victor Stinner3a50e702011-10-18 21:21:00 +020015242
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015243 return _Py_INIT_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015244}
15245
15246/* Finalize the Unicode implementation */
15247
/* Clear the free list of the Unicode implementation.
   There is nothing to clear here, so the function always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15253
Guido van Rossumd57fd912000-03-10 22:53:23 +000015254void
Thomas Wouters78890102000-07-22 19:25:51 +000015255_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015256{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015257 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015258
Serhiy Storchaka05997252013-01-26 12:14:02 +020015259 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015260
Serhiy Storchaka05997252013-01-26 12:14:02 +020015261 for (i = 0; i < 256; i++)
15262 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015263 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015264 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015265}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015266
Walter Dörwald16807132007-05-25 13:52:07 +000015267void
15268PyUnicode_InternInPlace(PyObject **p)
15269{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015270 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015271 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015272#ifdef Py_DEBUG
15273 assert(s != NULL);
15274 assert(_PyUnicode_CHECK(s));
15275#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015276 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015277 return;
15278#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015279 /* If it's a subclass, we don't really know what putting
15280 it in the interned dict might do. */
15281 if (!PyUnicode_CheckExact(s))
15282 return;
15283 if (PyUnicode_CHECK_INTERNED(s))
15284 return;
15285 if (interned == NULL) {
15286 interned = PyDict_New();
15287 if (interned == NULL) {
15288 PyErr_Clear(); /* Don't leave an exception */
15289 return;
15290 }
15291 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015292 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015293 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015294 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015295 if (t == NULL) {
15296 PyErr_Clear();
15297 return;
15298 }
15299 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015300 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015301 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015302 return;
15303 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015304 /* The two references in interned are not counted by refcnt.
15305 The deallocator will take care of this */
15306 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015307 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015308}
15309
15310void
15311PyUnicode_InternImmortal(PyObject **p)
15312{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015313 PyUnicode_InternInPlace(p);
15314 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015315 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015316 Py_INCREF(*p);
15317 }
Walter Dörwald16807132007-05-25 13:52:07 +000015318}
15319
15320PyObject *
15321PyUnicode_InternFromString(const char *cp)
15322{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015323 PyObject *s = PyUnicode_FromString(cp);
15324 if (s == NULL)
15325 return NULL;
15326 PyUnicode_InternInPlace(&s);
15327 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015328}
15329
Alexander Belopolsky40018472011-02-26 01:02:56 +000015330void
15331_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015332{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015333 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015334 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015335 Py_ssize_t i, n;
15336 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015337
Benjamin Peterson14339b62009-01-31 16:36:08 +000015338 if (interned == NULL || !PyDict_Check(interned))
15339 return;
15340 keys = PyDict_Keys(interned);
15341 if (keys == NULL || !PyList_Check(keys)) {
15342 PyErr_Clear();
15343 return;
15344 }
Walter Dörwald16807132007-05-25 13:52:07 +000015345
Benjamin Peterson14339b62009-01-31 16:36:08 +000015346 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15347 detector, interned unicode strings are not forcibly deallocated;
15348 rather, we give them their stolen references back, and then clear
15349 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015350
Benjamin Peterson14339b62009-01-31 16:36:08 +000015351 n = PyList_GET_SIZE(keys);
15352 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015353 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015354 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015355 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015356 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015357 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015358 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015359 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015360 case SSTATE_NOT_INTERNED:
15361 /* XXX Shouldn't happen */
15362 break;
15363 case SSTATE_INTERNED_IMMORTAL:
15364 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015365 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015366 break;
15367 case SSTATE_INTERNED_MORTAL:
15368 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015369 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015370 break;
15371 default:
15372 Py_FatalError("Inconsistent interned string state.");
15373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015374 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015375 }
15376 fprintf(stderr, "total size of all interned strings: "
15377 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15378 "mortal/immortal\n", mortal_size, immortal_size);
15379 Py_DECREF(keys);
15380 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015381 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015382}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015383
15384
15385/********************* Unicode Iterator **************************/
15386
15387typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015388 PyObject_HEAD
15389 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015390 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015391} unicodeiterobject;
15392
15393static void
15394unicodeiter_dealloc(unicodeiterobject *it)
15395{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015396 _PyObject_GC_UNTRACK(it);
15397 Py_XDECREF(it->it_seq);
15398 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015399}
15400
15401static int
15402unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15403{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015404 Py_VISIT(it->it_seq);
15405 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015406}
15407
15408static PyObject *
15409unicodeiter_next(unicodeiterobject *it)
15410{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015411 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015412
Benjamin Peterson14339b62009-01-31 16:36:08 +000015413 assert(it != NULL);
15414 seq = it->it_seq;
15415 if (seq == NULL)
15416 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015417 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015419 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15420 int kind = PyUnicode_KIND(seq);
15421 void *data = PyUnicode_DATA(seq);
15422 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15423 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015424 if (item != NULL)
15425 ++it->it_index;
15426 return item;
15427 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015428
Benjamin Peterson14339b62009-01-31 16:36:08 +000015429 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015430 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015431 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015432}
15433
15434static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015435unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015436{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015437 Py_ssize_t len = 0;
15438 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015439 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015440 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015441}
15442
15443PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15444
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015445static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015446unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015447{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015448 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015449 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015450 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015451 it->it_seq, it->it_index);
15452 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015453 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015454 if (u == NULL)
15455 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015456 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015457 }
15458}
15459
15460PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15461
15462static PyObject *
15463unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15464{
15465 Py_ssize_t index = PyLong_AsSsize_t(state);
15466 if (index == -1 && PyErr_Occurred())
15467 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015468 if (it->it_seq != NULL) {
15469 if (index < 0)
15470 index = 0;
15471 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15472 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15473 it->it_index = index;
15474 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015475 Py_RETURN_NONE;
15476}
15477
15478PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15479
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015480static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015481 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015482 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015483 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15484 reduce_doc},
15485 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15486 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015487 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015488};
15489
15490PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015491 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15492 "str_iterator", /* tp_name */
15493 sizeof(unicodeiterobject), /* tp_basicsize */
15494 0, /* tp_itemsize */
15495 /* methods */
15496 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15497 0, /* tp_print */
15498 0, /* tp_getattr */
15499 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015500 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015501 0, /* tp_repr */
15502 0, /* tp_as_number */
15503 0, /* tp_as_sequence */
15504 0, /* tp_as_mapping */
15505 0, /* tp_hash */
15506 0, /* tp_call */
15507 0, /* tp_str */
15508 PyObject_GenericGetAttr, /* tp_getattro */
15509 0, /* tp_setattro */
15510 0, /* tp_as_buffer */
15511 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15512 0, /* tp_doc */
15513 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15514 0, /* tp_clear */
15515 0, /* tp_richcompare */
15516 0, /* tp_weaklistoffset */
15517 PyObject_SelfIter, /* tp_iter */
15518 (iternextfunc)unicodeiter_next, /* tp_iternext */
15519 unicodeiter_methods, /* tp_methods */
15520 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015521};
15522
15523static PyObject *
15524unicode_iter(PyObject *seq)
15525{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015526 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015527
Benjamin Peterson14339b62009-01-31 16:36:08 +000015528 if (!PyUnicode_Check(seq)) {
15529 PyErr_BadInternalCall();
15530 return NULL;
15531 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015532 if (PyUnicode_READY(seq) == -1)
15533 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015534 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15535 if (it == NULL)
15536 return NULL;
15537 it->it_index = 0;
15538 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015539 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015540 _PyObject_GC_TRACK(it);
15541 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015542}
15543
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015544
/* Return the number of characters in the null-terminated string u,
   not counting the terminator.  Thin wrapper around wcslen(). */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t length = wcslen(u);

    return length;
}
15550
/* Copy the null-terminated string s2, terminator included, to s1.
   s1 must be large enough; no bounds are checked.  Returns s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;

        *dst++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15558
/* Copy at most n characters from the null-terminated string s2 to s1.

   Copying stops after the terminating null character of s2 has been
   copied, or after n characters, whichever comes first.  As with
   strncpy(), s1 is NOT null-terminated when s2 holds n or more
   characters.  Unlike strncpy(), the rest of s1 is not zero-padded.
   Nothing is written when n is 0.  Returns s1. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *dst = s1;

    /* Check the budget BEFORE writing: testing it only after the copy
       would store n + 1 characters and overrun a buffer of size n. */
    while (n > 0) {
        n--;
        if ((*dst++ = *s2++) == 0) {
            break;
        }
    }
    return s1;
}
15568
/* Append the null-terminated string s2, terminator included, to the end
   of the null-terminated string s1.  s1 must have room for the result;
   no bounds are checked.  Returns s1. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1 + wcslen(s1);

    for (;;) {
        Py_UNICODE ch = *s2++;

        *dst++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15577
/* Compare two null-terminated strings character by character.

   Returns -1 if s1 sorts before s2, +1 if it sorts after, and 0 if both
   strings are equal.  A string that is a proper prefix of the other one
   sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix.  *s1 == *s2 with *s1 != 0 implies that
       *s2 is not the terminator either. */
    while (*s1 != 0 && *s1 == *s2) {
        s1++;
        s2++;
    }

    if (*s1 == 0) {
        return (*s2 == 0) ? 0 : -1;
    }
    if (*s2 == 0) {
        return 1;
    }
    return (*s1 < *s2) ? -1 : +1;
}
15591
/* Compare at most the first n characters of two null-terminated strings.

   Returns -1 or +1 at the first position where the strings differ, and 0
   if they are equal up to the terminator or up to n characters
   (including when n is 0). */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        Py_UNICODE left = s1[i];
        Py_UNICODE right = s2[i];

        if (left != right) {
            return (left < right) ? -1 : +1;
        }
        if (left == '\0') {
            /* Both strings ended at the same position. */
            return 0;
        }
    }
    return 0;
}
15608
/* Return a pointer to the first occurrence of c in the null-terminated
   string s, or NULL if c does not occur.  The terminating null character
   is not searched, so looking for 0 always yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c) {
            return (Py_UNICODE*)s;
        }
        s++;
    }
    return NULL;
}
15618
/* Return a pointer to the last occurrence of c in the null-terminated
   string s, or NULL if c does not occur.  The terminating null character
   is not searched, so looking for 0 always yields NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t pos = wcslen(s);

    /* Walk backwards from the last character to the first one. */
    while (pos > 0) {
        pos--;
        if (s[pos] == c) {
            return (Py_UNICODE*)(s + pos);
        }
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015631
Victor Stinner71133ff2010-09-01 23:43:53 +000015632Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015633PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015634{
Victor Stinner577db2c2011-10-11 22:12:48 +020015635 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015636 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015638 if (!PyUnicode_Check(unicode)) {
15639 PyErr_BadArgument();
15640 return NULL;
15641 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015642 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015643 if (u == NULL)
15644 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015645 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015646 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015647 PyErr_NoMemory();
15648 return NULL;
15649 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015650 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015651 size *= sizeof(Py_UNICODE);
15652 copy = PyMem_Malloc(size);
15653 if (copy == NULL) {
15654 PyErr_NoMemory();
15655 return NULL;
15656 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015657 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015658 return copy;
15659}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015660
Georg Brandl66c221e2010-10-14 07:04:07 +000015661/* A _string module, to export formatter_parser and formatter_field_name_split
15662 to the string.Formatter class implemented in Python. */
15663
15664static PyMethodDef _string_methods[] = {
15665 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15666 METH_O, PyDoc_STR("split the argument as a field name")},
15667 {"formatter_parser", (PyCFunction) formatter_parser,
15668 METH_O, PyDoc_STR("parse the argument as a format string")},
15669 {NULL, NULL}
15670};
15671
15672static struct PyModuleDef _string_module = {
15673 PyModuleDef_HEAD_INIT,
15674 "_string",
15675 PyDoc_STR("string helper module"),
15676 0,
15677 _string_methods,
15678 NULL,
15679 NULL,
15680 NULL,
15681 NULL
15682};
15683
15684PyMODINIT_FUNC
15685PyInit__string(void)
15686{
15687 return PyModule_Create(&_string_module);
15688}
15689
15690
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015691#ifdef __cplusplus
15692}
15693#endif