blob: 9d3ed0d18b159e60924295cc5a2fc5ccf59d7dc2 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010043#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010044#include "pycore_object.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010045#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050047#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070048#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
Larry Hastings61272b72014-01-07 12:41:53 -080054/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090055class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080056[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090057/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
58
59/*[python input]
60class Py_UCS4_converter(CConverter):
61 type = 'Py_UCS4'
62 converter = 'convert_uc'
63
64 def converter_init(self):
65 if self.default is not unspecified:
66 self.c_default = ascii(self.default)
67 if len(self.c_default) > 4 or self.c_default[0] != "'":
68 self.c_default = hex(ord(self.default))
69
70[python start generated code]*/
71/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080072
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000073/* --- Globals ------------------------------------------------------------
74
Serhiy Storchaka05997252013-01-26 12:14:02 +020075NOTE: In the interpreter's initialization phase, some globals are currently
76 initialized dynamically as needed. In the process Unicode objects may
77 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000078
79*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000080
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000081
82#ifdef __cplusplus
83extern "C" {
84#endif
85
Victor Stinner8faf8212011-12-08 22:14:11 +010086/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
87#define MAX_UNICODE 0x10ffff
88
Victor Stinner910337b2011-10-03 03:20:16 +020089#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020090# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020091#else
92# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
93#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020094
Victor Stinnere90fe6a2011-10-01 16:48:13 +020095#define _PyUnicode_UTF8(op) \
96 (((PyCompactUnicodeObject*)(op))->utf8)
97#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020098 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020099 assert(PyUnicode_IS_READY(op)), \
100 PyUnicode_IS_COMPACT_ASCII(op) ? \
101 ((char*)((PyASCIIObject*)(op) + 1)) : \
102 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +0200103#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200104 (((PyCompactUnicodeObject*)(op))->utf8_length)
105#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200106 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200107 assert(PyUnicode_IS_READY(op)), \
108 PyUnicode_IS_COMPACT_ASCII(op) ? \
109 ((PyASCIIObject*)(op))->length : \
110 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +0200111#define _PyUnicode_WSTR(op) \
112 (((PyASCIIObject*)(op))->wstr)
113#define _PyUnicode_WSTR_LENGTH(op) \
114 (((PyCompactUnicodeObject*)(op))->wstr_length)
115#define _PyUnicode_LENGTH(op) \
116 (((PyASCIIObject *)(op))->length)
117#define _PyUnicode_STATE(op) \
118 (((PyASCIIObject *)(op))->state)
119#define _PyUnicode_HASH(op) \
120 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200121#define _PyUnicode_KIND(op) \
122 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200123 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200124#define _PyUnicode_GET_LENGTH(op) \
125 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200127#define _PyUnicode_DATA_ANY(op) \
128 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200129
Victor Stinner910337b2011-10-03 03:20:16 +0200130#undef PyUnicode_READY
131#define PyUnicode_READY(op) \
132 (assert(_PyUnicode_CHECK(op)), \
133 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200134 0 : \
Victor Stinner7931d9a2011-11-04 00:22:48 +0100135 _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200136
/* True if the object's utf8 pointer is the same pointer as its character
   data, i.e. the UTF-8 form is not stored in a separate buffer.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the object's wstr pointer is the same pointer as its character
   data.  Both sides of the comparison are taken from the macro argument;
   the argument does not have to be a variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
144
Victor Stinner829c0ad2011-10-03 01:08:02 +0200145/* true if the Unicode object has an allocated UTF-8 memory block
146 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200147#define _PyUnicode_HAS_UTF8_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200148 ((!PyUnicode_IS_COMPACT_ASCII(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200149 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200150 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
151
Victor Stinner03490912011-10-03 23:45:12 +0200152/* true if the Unicode object has an allocated wstr memory block
153 (not shared with other data) */
154#define _PyUnicode_HAS_WSTR_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200155 ((_PyUnicode_WSTR(op) && \
Victor Stinner03490912011-10-03 23:45:12 +0200156 (!PyUnicode_IS_READY(op) || \
157 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
158
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The bulk of the input is copied four characters per iteration; the
   remaining (at most three) characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (const from_type *)(begin);            \
        const from_type *_end = (const from_type *)(end);               \
        Py_ssize_t _count = _end - _iter;                               \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);                     \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {           \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < _end; ++_iter, ++_to) {                          \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200182
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200183#ifdef MS_WINDOWS
184 /* On Windows, overallocate by 50% is the best factor */
185# define OVERALLOCATE_FACTOR 2
186#else
187 /* On Linux, overallocate by 25% is the best factor */
188# define OVERALLOCATE_FACTOR 4
189#endif
190
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200199static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200203
/* Take a new reference to the shared empty string.  If the singleton does
   not exist yet it is created with PyUnicode_New(0, 0); when that creation
   fails, unicode_empty stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function.  The returned value is NULL if the singleton could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000222
Victor Stinner59423e32018-11-26 13:40:01 +0100223static inline void
224unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
225 Py_ssize_t start, Py_ssize_t length)
226{
227 assert(0 <= start);
228 assert(kind != PyUnicode_WCHAR_KIND);
229 switch (kind) {
230 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100231 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100232 Py_UCS1 ch = (unsigned char)value;
233 Py_UCS1 *to = (Py_UCS1 *)data + start;
234 memset(to, ch, length);
235 break;
236 }
237 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100238 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100239 Py_UCS2 ch = (Py_UCS2)value;
240 Py_UCS2 *to = (Py_UCS2 *)data + start;
241 const Py_UCS2 *end = to + length;
242 for (; to < end; ++to) *to = ch;
243 break;
244 }
245 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100246 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100247 Py_UCS4 ch = value;
248 Py_UCS4 * to = (Py_UCS4 *)data + start;
249 const Py_UCS4 *end = to + length;
250 for (; to < end; ++to) *to = ch;
251 break;
252 }
253 default: Py_UNREACHABLE();
254 }
255}
256
257
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200258/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700259static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200260_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
261
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200262/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200263static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200264
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265/* Single character Unicode strings in the Latin-1 range are being
266 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200267static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000268
/* Fast detection of the most frequent whitespace characters.
   The table has one entry per ASCII code point (0..127); an entry is 1 for
   the characters listed below and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
299
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200300/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200301static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200302static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100303static int unicode_modifiable(PyObject *unicode);
304
Victor Stinnerfe226c02011-10-03 03:52:20 +0200305
Alexander Belopolsky40018472011-02-26 01:02:56 +0000306static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100307_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200308static PyObject *
309_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
310static PyObject *
311_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
312
313static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000314unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000315 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100316 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000317 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
318
Alexander Belopolsky40018472011-02-26 01:02:56 +0000319static void
320raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300321 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100322 PyObject *unicode,
323 Py_ssize_t startpos, Py_ssize_t endpos,
324 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000325
/* Fast detection of the most frequent line break characters.
   The table has one entry per ASCII code point (0..127); an entry is 1 for
   the characters listed below and 0 for everything else. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
353
INADA Naoki3ae20562017-01-16 20:41:20 +0900354static int convert_uc(PyObject *obj, void *addr);
355
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300356#include "clinic/unicodeobject.c.h"
357
Victor Stinner3d4226a2018-08-29 22:21:32 +0200358_Py_error_handler
359_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200360{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200361 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200362 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200363 }
364 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200365 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200366 }
367 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200368 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200369 }
370 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200371 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200372 }
373 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200374 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200375 }
376 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200377 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200378 }
379 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200380 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200381 }
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_OTHER;
383}
384
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300385/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
386 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000387Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000388PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000389{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000390#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000391 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000392#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000393 /* This is actually an illegal character, so it should
394 not be passed to unichr. */
395 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000396#endif
397}
398
#ifdef Py_DEBUG
/* Debug-build self-check of the internal state of a unicode object.

   Verifies that the state flags (kind, compact, ascii, ready), the data,
   utf8 and wstr pointers and the associated lengths agree with each other.
   If check_content is non-zero and the kind is not PyUnicode_WCHAR_KIND,
   the characters are scanned as well to confirm that the smallest
   sufficient kind is used and that the data ends with a null character.

   Every check goes through _PyObject_ASSERT().  The function returns 1 when
   all checks pass, so it can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
#define ASSERT(expr) _PyObject_ASSERT(op, (expr))

    PyASCIIObject *base;
    unsigned int kind;

    ASSERT(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii == 1 && base->state.compact == 1) {
        /* compact ASCII string */
        ASSERT(kind == PyUnicode_1BYTE_KIND);
        ASSERT(base->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (base->state.compact == 1) {
            /* compact, non-ASCII: the characters are located directly
               behind the PyCompactUnicodeObject structure */
            data = compact + 1;
            ASSERT(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            ASSERT(base->state.ascii == 0);
            ASSERT(base->state.ready == 1);
            ASSERT(compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* not ready: only the wstr representation exists */
            data = ((PyUnicodeObject *)op)->data.any;
            ASSERT(base->length == 0);
            ASSERT(base->hash == -1);
            ASSERT(base->state.compact == 0);
            ASSERT(base->state.ascii == 0);
            ASSERT(base->state.ready == 0);
            ASSERT(base->state.interned == SSTATE_NOT_INTERNED);
            ASSERT(base->wstr != NULL);
            ASSERT(data == NULL);
            ASSERT(compact->utf8 == NULL);
        }
        else {
            /* ready, non-compact: the characters live in data.any */
            data = ((PyUnicodeObject *)op)->data.any;
            ASSERT(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            ASSERT(base->state.compact == 0);
            ASSERT(base->state.ready == 1);
            ASSERT(data != NULL);
            if (base->state.ascii) {
                ASSERT(compact->utf8 == data);
                ASSERT(compact->utf8_length == base->length);
            }
            else {
                ASSERT(compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the character data exactly when the kind has
               the same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            const int wstr_shares_data = (kind == PyUnicode_2BYTE_KIND);
#else
            const int wstr_shares_data = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (wstr_shares_data) {
                ASSERT(base->wstr == data);
                ASSERT(compact->wstr_length == base->length);
            }
            else {
                ASSERT(base->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            ASSERT(compact->utf8_length == 0);
        }
        if (base->wstr == NULL) {
            ASSERT(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(base);
        Py_UCS4 maxchar = 0;
        Py_ssize_t i;

        for (i = 0; i < base->length; i++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (maxchar < ch) {
                maxchar = ch;
            }
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (base->state.ascii == 0) {
                ASSERT(maxchar >= 128);
                ASSERT(maxchar <= 255);
            }
            else {
                ASSERT(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            ASSERT(maxchar >= 0x100);
            ASSERT(maxchar <= 0xFFFF);
            break;
        default:
            ASSERT(maxchar >= 0x10000);
            ASSERT(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character after the last one must be a null character */
        ASSERT(PyUnicode_READ(kind, data, base->length) == 0);
    }
    return 1;

#undef ASSERT
}
#endif
518
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100519static PyObject*
520unicode_result_wchar(PyObject *unicode)
521{
522#ifndef Py_DEBUG
523 Py_ssize_t len;
524
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100525 len = _PyUnicode_WSTR_LENGTH(unicode);
526 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100527 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200528 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100529 }
530
531 if (len == 1) {
532 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100533 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100534 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
535 Py_DECREF(unicode);
536 return latin1_char;
537 }
538 }
539
540 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200541 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100542 return NULL;
543 }
544#else
Victor Stinneraa771272012-10-04 02:32:58 +0200545 assert(Py_REFCNT(unicode) == 1);
546
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100547 /* don't make the result ready in debug mode to ensure that the caller
548 makes the string ready before using it */
549 assert(_PyUnicode_CheckConsistency(unicode, 1));
550#endif
551 return unicode;
552}
553
554static PyObject*
555unicode_result_ready(PyObject *unicode)
556{
557 Py_ssize_t length;
558
559 length = PyUnicode_GET_LENGTH(unicode);
560 if (length == 0) {
561 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100562 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200563 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100564 }
565 return unicode_empty;
566 }
567
568 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200569 void *data = PyUnicode_DATA(unicode);
570 int kind = PyUnicode_KIND(unicode);
571 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100572 if (ch < 256) {
573 PyObject *latin1_char = unicode_latin1[ch];
574 if (latin1_char != NULL) {
575 if (unicode != latin1_char) {
576 Py_INCREF(latin1_char);
577 Py_DECREF(unicode);
578 }
579 return latin1_char;
580 }
581 else {
582 assert(_PyUnicode_CheckConsistency(unicode, 1));
583 Py_INCREF(unicode);
584 unicode_latin1[ch] = unicode;
585 return unicode;
586 }
587 }
588 }
589
590 assert(_PyUnicode_CheckConsistency(unicode, 1));
591 return unicode;
592}
593
594static PyObject*
595unicode_result(PyObject *unicode)
596{
597 assert(_PyUnicode_CHECK(unicode));
598 if (PyUnicode_IS_READY(unicode))
599 return unicode_result_ready(unicode);
600 else
601 return unicode_result_wchar(unicode);
602}
603
Victor Stinnerc4b49542011-12-11 22:44:26 +0100604static PyObject*
605unicode_result_unchanged(PyObject *unicode)
606{
607 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500608 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100609 return NULL;
610 Py_INCREF(unicode);
611 return unicode;
612 }
613 else
614 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100615 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100616}
617
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200618/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
619 ASCII, Latin1, UTF-8, etc. */
620static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200621backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200622 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
623{
Victor Stinnerad771582015-10-09 12:38:53 +0200624 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 Py_UCS4 ch;
626 enum PyUnicode_Kind kind;
627 void *data;
628
629 assert(PyUnicode_IS_READY(unicode));
630 kind = PyUnicode_KIND(unicode);
631 data = PyUnicode_DATA(unicode);
632
633 size = 0;
634 /* determine replacement size */
635 for (i = collstart; i < collend; ++i) {
636 Py_ssize_t incr;
637
638 ch = PyUnicode_READ(kind, data, i);
639 if (ch < 0x100)
640 incr = 2+2;
641 else if (ch < 0x10000)
642 incr = 2+4;
643 else {
644 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200645 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 }
647 if (size > PY_SSIZE_T_MAX - incr) {
648 PyErr_SetString(PyExc_OverflowError,
649 "encoded result is too long for a Python string");
650 return NULL;
651 }
652 size += incr;
653 }
654
Victor Stinnerad771582015-10-09 12:38:53 +0200655 str = _PyBytesWriter_Prepare(writer, str, size);
656 if (str == NULL)
657 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200658
659 /* generate replacement */
660 for (i = collstart; i < collend; ++i) {
661 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200662 *str++ = '\\';
663 if (ch >= 0x00010000) {
664 *str++ = 'U';
665 *str++ = Py_hexdigits[(ch>>28)&0xf];
666 *str++ = Py_hexdigits[(ch>>24)&0xf];
667 *str++ = Py_hexdigits[(ch>>20)&0xf];
668 *str++ = Py_hexdigits[(ch>>16)&0xf];
669 *str++ = Py_hexdigits[(ch>>12)&0xf];
670 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200671 }
Victor Stinner797485e2015-10-09 03:17:30 +0200672 else if (ch >= 0x100) {
673 *str++ = 'u';
674 *str++ = Py_hexdigits[(ch>>12)&0xf];
675 *str++ = Py_hexdigits[(ch>>8)&0xf];
676 }
677 else
678 *str++ = 'x';
679 *str++ = Py_hexdigits[(ch>>4)&0xf];
680 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200681 }
682 return str;
683}
684
685/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
686 ASCII, Latin1, UTF-8, etc. */
687static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200688xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200689 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
690{
Victor Stinnerad771582015-10-09 12:38:53 +0200691 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200692 Py_UCS4 ch;
693 enum PyUnicode_Kind kind;
694 void *data;
695
696 assert(PyUnicode_IS_READY(unicode));
697 kind = PyUnicode_KIND(unicode);
698 data = PyUnicode_DATA(unicode);
699
700 size = 0;
701 /* determine replacement size */
702 for (i = collstart; i < collend; ++i) {
703 Py_ssize_t incr;
704
705 ch = PyUnicode_READ(kind, data, i);
706 if (ch < 10)
707 incr = 2+1+1;
708 else if (ch < 100)
709 incr = 2+2+1;
710 else if (ch < 1000)
711 incr = 2+3+1;
712 else if (ch < 10000)
713 incr = 2+4+1;
714 else if (ch < 100000)
715 incr = 2+5+1;
716 else if (ch < 1000000)
717 incr = 2+6+1;
718 else {
719 assert(ch <= MAX_UNICODE);
720 incr = 2+7+1;
721 }
722 if (size > PY_SSIZE_T_MAX - incr) {
723 PyErr_SetString(PyExc_OverflowError,
724 "encoded result is too long for a Python string");
725 return NULL;
726 }
727 size += incr;
728 }
729
Victor Stinnerad771582015-10-09 12:38:53 +0200730 str = _PyBytesWriter_Prepare(writer, str, size);
731 if (str == NULL)
732 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200733
734 /* generate replacement */
735 for (i = collstart; i < collend; ++i) {
736 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
737 }
738 return str;
739}
740
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741/* --- Bloom Filters ----------------------------------------------------- */
742
743/* stuff to implement simple "bloom filters" for Unicode characters.
744 to keep things simple, we use a single bitmask, using the least 5
745 bits from each unicode characters as the bit index. */
746
747/* the linebreak mask is set up by Unicode_Init below */
748
Antoine Pitrouf068f942010-01-13 14:19:12 +0000749#if LONG_BIT >= 128
750#define BLOOM_WIDTH 128
751#elif LONG_BIT >= 64
752#define BLOOM_WIDTH 64
753#elif LONG_BIT >= 32
754#define BLOOM_WIDTH 32
755#else
756#error "LONG_BIT is smaller than 32"
757#endif
758
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759#define BLOOM_MASK unsigned long
760
Serhiy Storchaka05997252013-01-26 12:14:02 +0200761static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000762
Antoine Pitrouf068f942010-01-13 14:19:12 +0000763#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000764
Benjamin Peterson29060642009-01-31 22:14:21 +0000765#define BLOOM_LINEBREAK(ch) \
766 ((ch) < 128U ? ascii_linebreak[(ch)] : \
767 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000768
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700769static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200770make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000771{
Victor Stinnera85af502013-04-09 21:53:54 +0200772#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
773 do { \
774 TYPE *data = (TYPE *)PTR; \
775 TYPE *end = data + LEN; \
776 Py_UCS4 ch; \
777 for (; data != end; data++) { \
778 ch = *data; \
779 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
780 } \
781 break; \
782 } while (0)
783
Thomas Wouters477c8d52006-05-27 19:21:47 +0000784 /* calculate simple bloom-style bitmask for a given unicode string */
785
Antoine Pitrouf068f942010-01-13 14:19:12 +0000786 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000787
788 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200789 switch (kind) {
790 case PyUnicode_1BYTE_KIND:
791 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
792 break;
793 case PyUnicode_2BYTE_KIND:
794 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
795 break;
796 case PyUnicode_4BYTE_KIND:
797 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
798 break;
799 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700800 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200801 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000802 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200803
804#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000805}
806
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300807static int
808ensure_unicode(PyObject *obj)
809{
810 if (!PyUnicode_Check(obj)) {
811 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200812 "must be str, not %.100s",
813 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300814 return -1;
815 }
816 return PyUnicode_READY(obj);
817}
818
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200819/* Compilation of templated routines */
820
821#include "stringlib/asciilib.h"
822#include "stringlib/fastsearch.h"
823#include "stringlib/partition.h"
824#include "stringlib/split.h"
825#include "stringlib/count.h"
826#include "stringlib/find.h"
827#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200828#include "stringlib/undef.h"
829
830#include "stringlib/ucs1lib.h"
831#include "stringlib/fastsearch.h"
832#include "stringlib/partition.h"
833#include "stringlib/split.h"
834#include "stringlib/count.h"
835#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300836#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200837#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200838#include "stringlib/undef.h"
839
840#include "stringlib/ucs2lib.h"
841#include "stringlib/fastsearch.h"
842#include "stringlib/partition.h"
843#include "stringlib/split.h"
844#include "stringlib/count.h"
845#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300846#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200847#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200848#include "stringlib/undef.h"
849
850#include "stringlib/ucs4lib.h"
851#include "stringlib/fastsearch.h"
852#include "stringlib/partition.h"
853#include "stringlib/split.h"
854#include "stringlib/count.h"
855#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300856#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200857#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200858#include "stringlib/undef.h"
859
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200860#include "stringlib/unicodedefs.h"
861#include "stringlib/fastsearch.h"
862#include "stringlib/count.h"
863#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100864#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200865
Guido van Rossumd57fd912000-03-10 22:53:23 +0000866/* --- Unicode Object ----------------------------------------------------- */
867
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700868static inline Py_ssize_t
869findchar(const void *s, int kind,
870 Py_ssize_t size, Py_UCS4 ch,
871 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200872{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200873 switch (kind) {
874 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200875 if ((Py_UCS1) ch != ch)
876 return -1;
877 if (direction > 0)
878 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
879 else
880 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200881 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200882 if ((Py_UCS2) ch != ch)
883 return -1;
884 if (direction > 0)
885 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
886 else
887 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200888 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200889 if (direction > 0)
890 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
891 else
892 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200893 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700894 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200896}
897
Victor Stinnerafffce42012-10-03 23:03:17 +0200898#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000899/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200900 earlier.
901
902 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
903 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
904 invalid character in Unicode 6.0. */
905static void
906unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
907{
908 int kind = PyUnicode_KIND(unicode);
909 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
910 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
911 if (length <= old_length)
912 return;
913 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
914}
915#endif
916
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917static PyObject*
918resize_compact(PyObject *unicode, Py_ssize_t length)
919{
920 Py_ssize_t char_size;
921 Py_ssize_t struct_size;
922 Py_ssize_t new_size;
923 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100924 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200925#ifdef Py_DEBUG
926 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
927#endif
928
Victor Stinner79891572012-05-03 13:43:07 +0200929 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200930 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100931 assert(PyUnicode_IS_COMPACT(unicode));
932
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200933 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100934 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935 struct_size = sizeof(PyASCIIObject);
936 else
937 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200938 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
941 PyErr_NoMemory();
942 return NULL;
943 }
944 new_size = (struct_size + (length + 1) * char_size);
945
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200946 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
947 PyObject_DEL(_PyUnicode_UTF8(unicode));
948 _PyUnicode_UTF8(unicode) = NULL;
949 _PyUnicode_UTF8_LENGTH(unicode) = 0;
950 }
Victor Stinner84def372011-12-11 20:04:56 +0100951 _Py_DEC_REFTOTAL;
952 _Py_ForgetReference(unicode);
953
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300954 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100955 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100956 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 PyErr_NoMemory();
958 return NULL;
959 }
Victor Stinner84def372011-12-11 20:04:56 +0100960 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100962
Victor Stinnerfe226c02011-10-03 03:52:20 +0200963 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200964 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200965 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100966 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200967 _PyUnicode_WSTR_LENGTH(unicode) = length;
968 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100969 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
970 PyObject_DEL(_PyUnicode_WSTR(unicode));
971 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100972 if (!PyUnicode_IS_ASCII(unicode))
973 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100974 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200975#ifdef Py_DEBUG
976 unicode_fill_invalid(unicode, old_length);
977#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200978 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
979 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200980 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200981 return unicode;
982}
983
Alexander Belopolsky40018472011-02-26 01:02:56 +0000984static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200985resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000986{
Victor Stinner95663112011-10-04 01:03:50 +0200987 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100988 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200989 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200990 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000991
Victor Stinnerfe226c02011-10-03 03:52:20 +0200992 if (PyUnicode_IS_READY(unicode)) {
993 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200994 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200995 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200996#ifdef Py_DEBUG
997 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
998#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999
1000 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001001 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1003 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001004
1005 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1006 PyErr_NoMemory();
1007 return -1;
1008 }
1009 new_size = (length + 1) * char_size;
1010
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1012 {
1013 PyObject_DEL(_PyUnicode_UTF8(unicode));
1014 _PyUnicode_UTF8(unicode) = NULL;
1015 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1016 }
1017
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 data = (PyObject *)PyObject_REALLOC(data, new_size);
1019 if (data == NULL) {
1020 PyErr_NoMemory();
1021 return -1;
1022 }
1023 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001024 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001026 _PyUnicode_WSTR_LENGTH(unicode) = length;
1027 }
1028 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001029 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001030 _PyUnicode_UTF8_LENGTH(unicode) = length;
1031 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001032 _PyUnicode_LENGTH(unicode) = length;
1033 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001034#ifdef Py_DEBUG
1035 unicode_fill_invalid(unicode, old_length);
1036#endif
Victor Stinner95663112011-10-04 01:03:50 +02001037 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001038 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001039 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 }
Victor Stinner95663112011-10-04 01:03:50 +02001042 assert(_PyUnicode_WSTR(unicode) != NULL);
1043
1044 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001045 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001046 PyErr_NoMemory();
1047 return -1;
1048 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001049 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001050 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001051 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001052 if (!wstr) {
1053 PyErr_NoMemory();
1054 return -1;
1055 }
1056 _PyUnicode_WSTR(unicode) = wstr;
1057 _PyUnicode_WSTR(unicode)[length] = 0;
1058 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001059 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 return 0;
1061}
1062
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063static PyObject*
1064resize_copy(PyObject *unicode, Py_ssize_t length)
1065{
1066 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001067 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001068 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001069
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001070 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001071
1072 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1073 if (copy == NULL)
1074 return NULL;
1075
1076 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001077 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001079 }
1080 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001081 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001082
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001083 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001084 if (w == NULL)
1085 return NULL;
1086 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1087 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001088 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001089 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001090 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001091 }
1092}
1093
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001095 Ux0000 terminated; some code (e.g. new_identifier)
1096 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097
1098 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001099 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100
1101*/
1102
Alexander Belopolsky40018472011-02-26 01:02:56 +00001103static PyUnicodeObject *
1104_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001106 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108
Thomas Wouters477c8d52006-05-27 19:21:47 +00001109 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 if (length == 0 && unicode_empty != NULL) {
1111 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001112 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 }
1114
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001115 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001116 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001117 return (PyUnicodeObject *)PyErr_NoMemory();
1118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 if (length < 0) {
1120 PyErr_SetString(PyExc_SystemError,
1121 "Negative size passed to _PyUnicode_New");
1122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 }
1124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1126 if (unicode == NULL)
1127 return NULL;
1128 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001129
1130 _PyUnicode_WSTR_LENGTH(unicode) = length;
1131 _PyUnicode_HASH(unicode) = -1;
1132 _PyUnicode_STATE(unicode).interned = 0;
1133 _PyUnicode_STATE(unicode).kind = 0;
1134 _PyUnicode_STATE(unicode).compact = 0;
1135 _PyUnicode_STATE(unicode).ready = 0;
1136 _PyUnicode_STATE(unicode).ascii = 0;
1137 _PyUnicode_DATA_ANY(unicode) = NULL;
1138 _PyUnicode_LENGTH(unicode) = 0;
1139 _PyUnicode_UTF8(unicode) = NULL;
1140 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1143 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001144 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001145 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001146 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001147 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148
Jeremy Hyltond8082792003-09-16 19:41:39 +00001149 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001150 * the caller fails before initializing str -- unicode_resize()
1151 * reads str[0], and the Keep-Alive optimization can keep memory
1152 * allocated for str alive across a call to unicode_dealloc(unicode).
1153 * We don't want unicode_resize to read uninitialized memory in
1154 * that case.
1155 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156 _PyUnicode_WSTR(unicode)[0] = 0;
1157 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001158
Victor Stinner7931d9a2011-11-04 00:22:48 +01001159 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160 return unicode;
1161}
1162
Victor Stinnerf42dc442011-10-02 23:33:16 +02001163static const char*
1164unicode_kind_name(PyObject *unicode)
1165{
Victor Stinner42dfd712011-10-03 14:41:45 +02001166 /* don't check consistency: unicode_kind_name() is called from
1167 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001168 if (!PyUnicode_IS_COMPACT(unicode))
1169 {
1170 if (!PyUnicode_IS_READY(unicode))
1171 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001172 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001173 {
1174 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001175 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001176 return "legacy ascii";
1177 else
1178 return "legacy latin1";
1179 case PyUnicode_2BYTE_KIND:
1180 return "legacy UCS2";
1181 case PyUnicode_4BYTE_KIND:
1182 return "legacy UCS4";
1183 default:
1184 return "<legacy invalid kind>";
1185 }
1186 }
1187 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001188 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001189 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001190 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001191 return "ascii";
1192 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001193 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001194 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001195 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001196 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001197 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001198 default:
1199 return "<invalid compact kind>";
1200 }
1201}
1202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001203#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001205char *_PyUnicode_utf8(void *unicode_raw){
1206 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001207 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001208}
1209
Victor Stinnera42de742018-11-22 10:25:22 +01001210void *_PyUnicode_compact_data(void *unicode_raw) {
1211 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212 return _PyUnicode_COMPACT_DATA(unicode);
1213}
Victor Stinnera42de742018-11-22 10:25:22 +01001214void *_PyUnicode_data(void *unicode_raw) {
1215 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216 printf("obj %p\n", unicode);
1217 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1218 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1219 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1220 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1221 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1222 return PyUnicode_DATA(unicode);
1223}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001224
1225void
1226_PyUnicode_Dump(PyObject *op)
1227{
1228 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001229 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1230 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1231 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001232
Victor Stinnera849a4b2011-10-03 12:12:11 +02001233 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001234 {
1235 if (ascii->state.ascii)
1236 data = (ascii + 1);
1237 else
1238 data = (compact + 1);
1239 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001240 else
1241 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001242 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1243 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001244
Victor Stinnera849a4b2011-10-03 12:12:11 +02001245 if (ascii->wstr == data)
1246 printf("shared ");
1247 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001248
Victor Stinnera3b334d2011-10-03 13:53:37 +02001249 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001250 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001251 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1252 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001253 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1254 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001255 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001256 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001257}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258#endif
1259
1260PyObject *
1261PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1262{
1263 PyObject *obj;
1264 PyCompactUnicodeObject *unicode;
1265 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001266 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001267 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268 Py_ssize_t char_size;
1269 Py_ssize_t struct_size;
1270
1271 /* Optimization for empty strings */
1272 if (size == 0 && unicode_empty != NULL) {
1273 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001274 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001275 }
1276
Victor Stinner9e9d6892011-10-04 01:02:02 +02001277 is_ascii = 0;
1278 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001279 struct_size = sizeof(PyCompactUnicodeObject);
1280 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001281 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001282 char_size = 1;
1283 is_ascii = 1;
1284 struct_size = sizeof(PyASCIIObject);
1285 }
1286 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001287 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001288 char_size = 1;
1289 }
1290 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001291 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001292 char_size = 2;
1293 if (sizeof(wchar_t) == 2)
1294 is_sharing = 1;
1295 }
1296 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001297 if (maxchar > MAX_UNICODE) {
1298 PyErr_SetString(PyExc_SystemError,
1299 "invalid maximum character passed to PyUnicode_New");
1300 return NULL;
1301 }
Victor Stinner8f825062012-04-27 13:55:39 +02001302 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001303 char_size = 4;
1304 if (sizeof(wchar_t) == 4)
1305 is_sharing = 1;
1306 }
1307
1308 /* Ensure we won't overflow the size. */
1309 if (size < 0) {
1310 PyErr_SetString(PyExc_SystemError,
1311 "Negative size passed to PyUnicode_New");
1312 return NULL;
1313 }
1314 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1315 return PyErr_NoMemory();
1316
1317 /* Duplicated allocation code from _PyObject_New() instead of a call to
1318 * PyObject_New() so we are able to allocate space for the object and
1319 * it's data buffer.
1320 */
1321 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1322 if (obj == NULL)
1323 return PyErr_NoMemory();
1324 obj = PyObject_INIT(obj, &PyUnicode_Type);
1325 if (obj == NULL)
1326 return NULL;
1327
1328 unicode = (PyCompactUnicodeObject *)obj;
1329 if (is_ascii)
1330 data = ((PyASCIIObject*)obj) + 1;
1331 else
1332 data = unicode + 1;
1333 _PyUnicode_LENGTH(unicode) = size;
1334 _PyUnicode_HASH(unicode) = -1;
1335 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001336 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 _PyUnicode_STATE(unicode).compact = 1;
1338 _PyUnicode_STATE(unicode).ready = 1;
1339 _PyUnicode_STATE(unicode).ascii = is_ascii;
1340 if (is_ascii) {
1341 ((char*)data)[size] = 0;
1342 _PyUnicode_WSTR(unicode) = NULL;
1343 }
Victor Stinner8f825062012-04-27 13:55:39 +02001344 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345 ((char*)data)[size] = 0;
1346 _PyUnicode_WSTR(unicode) = NULL;
1347 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001349 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351 else {
1352 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001353 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001354 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001355 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001356 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357 ((Py_UCS4*)data)[size] = 0;
1358 if (is_sharing) {
1359 _PyUnicode_WSTR_LENGTH(unicode) = size;
1360 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1361 }
1362 else {
1363 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1364 _PyUnicode_WSTR(unicode) = NULL;
1365 }
1366 }
Victor Stinner8f825062012-04-27 13:55:39 +02001367#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001368 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001369#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001370 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371 return obj;
1372}
1373
#if SIZEOF_WCHAR_T == 2
/* Expand the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`.  A high surrogate that is directly followed by a low surrogate
   is merged into a single code point; any other unit (lone surrogates
   included) is copied through unchanged.

   `unicode` must already be a 4-byte kind string whose length matches the
   number of code points produced; this is only verified by assertions.
   The function does not write a terminating null character itself. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (!Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            || (src + 1) >= end
            || !Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Not the start of a valid pair: copy the unit as is. */
            *dst++ = *src;
            src += 1;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }
    /* Every slot of the destination must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1413
Victor Stinnercd9950f2011-10-02 00:34:53 +02001414static int
Victor Stinner488fa492011-12-12 00:01:39 +01001415unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001416{
Victor Stinner488fa492011-12-12 00:01:39 +01001417 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001418 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001419 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001420 return -1;
1421 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001422 return 0;
1423}
1424
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001425static int
1426_copy_characters(PyObject *to, Py_ssize_t to_start,
1427 PyObject *from, Py_ssize_t from_start,
1428 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001430 unsigned int from_kind, to_kind;
1431 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001432
Victor Stinneree4544c2012-05-09 22:24:08 +02001433 assert(0 <= how_many);
1434 assert(0 <= from_start);
1435 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001436 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001437 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001438 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439
Victor Stinnerd3f08822012-05-29 12:57:52 +02001440 assert(PyUnicode_Check(to));
1441 assert(PyUnicode_IS_READY(to));
1442 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1443
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001444 if (how_many == 0)
1445 return 0;
1446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001448 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001450 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451
Victor Stinnerf1852262012-06-16 16:38:26 +02001452#ifdef Py_DEBUG
1453 if (!check_maxchar
1454 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1455 {
1456 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1457 Py_UCS4 ch;
1458 Py_ssize_t i;
1459 for (i=0; i < how_many; i++) {
1460 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1461 assert(ch <= to_maxchar);
1462 }
1463 }
1464#endif
1465
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001466 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001467 if (check_maxchar
1468 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1469 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001470 /* Writing Latin-1 characters into an ASCII string requires to
1471 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001472 Py_UCS4 max_char;
1473 max_char = ucs1lib_find_max_char(from_data,
1474 (Py_UCS1*)from_data + how_many);
1475 if (max_char >= 128)
1476 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001477 }
Christian Heimesf051e432016-09-13 20:22:02 +02001478 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001479 (char*)from_data + from_kind * from_start,
1480 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001482 else if (from_kind == PyUnicode_1BYTE_KIND
1483 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001484 {
1485 _PyUnicode_CONVERT_BYTES(
1486 Py_UCS1, Py_UCS2,
1487 PyUnicode_1BYTE_DATA(from) + from_start,
1488 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1489 PyUnicode_2BYTE_DATA(to) + to_start
1490 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001491 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001492 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001493 && to_kind == PyUnicode_4BYTE_KIND)
1494 {
1495 _PyUnicode_CONVERT_BYTES(
1496 Py_UCS1, Py_UCS4,
1497 PyUnicode_1BYTE_DATA(from) + from_start,
1498 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1499 PyUnicode_4BYTE_DATA(to) + to_start
1500 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001501 }
1502 else if (from_kind == PyUnicode_2BYTE_KIND
1503 && to_kind == PyUnicode_4BYTE_KIND)
1504 {
1505 _PyUnicode_CONVERT_BYTES(
1506 Py_UCS2, Py_UCS4,
1507 PyUnicode_2BYTE_DATA(from) + from_start,
1508 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1509 PyUnicode_4BYTE_DATA(to) + to_start
1510 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001511 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001513 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1514
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001515 if (!check_maxchar) {
1516 if (from_kind == PyUnicode_2BYTE_KIND
1517 && to_kind == PyUnicode_1BYTE_KIND)
1518 {
1519 _PyUnicode_CONVERT_BYTES(
1520 Py_UCS2, Py_UCS1,
1521 PyUnicode_2BYTE_DATA(from) + from_start,
1522 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1523 PyUnicode_1BYTE_DATA(to) + to_start
1524 );
1525 }
1526 else if (from_kind == PyUnicode_4BYTE_KIND
1527 && to_kind == PyUnicode_1BYTE_KIND)
1528 {
1529 _PyUnicode_CONVERT_BYTES(
1530 Py_UCS4, Py_UCS1,
1531 PyUnicode_4BYTE_DATA(from) + from_start,
1532 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1533 PyUnicode_1BYTE_DATA(to) + to_start
1534 );
1535 }
1536 else if (from_kind == PyUnicode_4BYTE_KIND
1537 && to_kind == PyUnicode_2BYTE_KIND)
1538 {
1539 _PyUnicode_CONVERT_BYTES(
1540 Py_UCS4, Py_UCS2,
1541 PyUnicode_4BYTE_DATA(from) + from_start,
1542 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1543 PyUnicode_2BYTE_DATA(to) + to_start
1544 );
1545 }
1546 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001547 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001548 }
1549 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001550 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001551 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001552 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001553 Py_ssize_t i;
1554
Victor Stinnera0702ab2011-09-29 14:14:38 +02001555 for (i=0; i < how_many; i++) {
1556 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001557 if (ch > to_maxchar)
1558 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001559 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1560 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001561 }
1562 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 return 0;
1564}
1565
Victor Stinnerd3f08822012-05-29 12:57:52 +02001566void
1567_PyUnicode_FastCopyCharacters(
1568 PyObject *to, Py_ssize_t to_start,
1569 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001570{
1571 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1572}
1573
1574Py_ssize_t
1575PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1576 PyObject *from, Py_ssize_t from_start,
1577 Py_ssize_t how_many)
1578{
1579 int err;
1580
1581 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1582 PyErr_BadInternalCall();
1583 return -1;
1584 }
1585
Benjamin Petersonbac79492012-01-14 13:34:47 -05001586 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001587 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001588 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001589 return -1;
1590
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001591 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001592 PyErr_SetString(PyExc_IndexError, "string index out of range");
1593 return -1;
1594 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001595 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001596 PyErr_SetString(PyExc_IndexError, "string index out of range");
1597 return -1;
1598 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001599 if (how_many < 0) {
1600 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1601 return -1;
1602 }
1603 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001604 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1605 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001606 "Cannot write %zi characters at %zi "
1607 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001608 how_many, to_start, PyUnicode_GET_LENGTH(to));
1609 return -1;
1610 }
1611
1612 if (how_many == 0)
1613 return 0;
1614
Victor Stinner488fa492011-12-12 00:01:39 +01001615 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001616 return -1;
1617
1618 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1619 if (err) {
1620 PyErr_Format(PyExc_SystemError,
1621 "Cannot copy %s characters "
1622 "into a string of %s characters",
1623 unicode_kind_name(from),
1624 unicode_kind_name(to));
1625 return -1;
1626 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001627 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001628}
1629
Victor Stinner17222162011-09-28 22:15:37 +02001630/* Find the maximum code point and count the number of surrogate pairs so a
1631 correct string length can be computed before converting a string to UCS4.
1632 This function counts single surrogates as a character and not as a pair.
1633
1634 Return 0 on success, or -1 on error. */
1635static int
1636find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1637 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638{
1639 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001640 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001641
Victor Stinnerc53be962011-10-02 21:33:54 +02001642 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001643 *num_surrogates = 0;
1644 *maxchar = 0;
1645
1646 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001648 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1649 && (iter+1) < end
1650 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1651 {
1652 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1653 ++(*num_surrogates);
1654 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655 }
1656 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001658 {
1659 ch = *iter;
1660 iter++;
1661 }
1662 if (ch > *maxchar) {
1663 *maxchar = ch;
1664 if (*maxchar > MAX_UNICODE) {
1665 PyErr_Format(PyExc_ValueError,
1666 "character U+%x is not in range [U+0000; U+10ffff]",
1667 ch);
1668 return -1;
1669 }
1670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671 }
1672 return 0;
1673}
1674
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001675int
1676_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677{
1678 wchar_t *end;
1679 Py_UCS4 maxchar = 0;
1680 Py_ssize_t num_surrogates;
1681#if SIZEOF_WCHAR_T == 2
1682 Py_ssize_t length_wo_surrogates;
1683#endif
1684
Georg Brandl7597add2011-10-05 16:36:47 +02001685 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001686 strings were created using _PyObject_New() and where no canonical
1687 representation (the str field) has been set yet aka strings
1688 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001689 assert(_PyUnicode_CHECK(unicode));
1690 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001693 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001694 /* Actually, it should neither be interned nor be anything else: */
1695 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001698 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001699 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701
1702 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001703 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1704 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705 PyErr_NoMemory();
1706 return -1;
1707 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001708 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 _PyUnicode_WSTR(unicode), end,
1710 PyUnicode_1BYTE_DATA(unicode));
1711 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1712 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1713 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1714 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001715 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001716 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001717 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 }
1719 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001720 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001721 _PyUnicode_UTF8(unicode) = NULL;
1722 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 }
1724 PyObject_FREE(_PyUnicode_WSTR(unicode));
1725 _PyUnicode_WSTR(unicode) = NULL;
1726 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1727 }
1728 /* In this case we might have to convert down from 4-byte native
1729 wchar_t to 2-byte unicode. */
1730 else if (maxchar < 65536) {
1731 assert(num_surrogates == 0 &&
1732 "FindMaxCharAndNumSurrogatePairs() messed up");
1733
Victor Stinner506f5922011-09-28 22:34:18 +02001734#if SIZEOF_WCHAR_T == 2
1735 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001736 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001737 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1738 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1739 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001740 _PyUnicode_UTF8(unicode) = NULL;
1741 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001742#else
1743 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001744 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001745 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001746 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001747 PyErr_NoMemory();
1748 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 }
Victor Stinner506f5922011-09-28 22:34:18 +02001750 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1751 _PyUnicode_WSTR(unicode), end,
1752 PyUnicode_2BYTE_DATA(unicode));
1753 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1754 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1755 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001756 _PyUnicode_UTF8(unicode) = NULL;
1757 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001758 PyObject_FREE(_PyUnicode_WSTR(unicode));
1759 _PyUnicode_WSTR(unicode) = NULL;
1760 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1761#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 }
1763 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1764 else {
1765#if SIZEOF_WCHAR_T == 2
1766 /* in case the native representation is 2-bytes, we need to allocate a
1767 new normalized 4-byte version. */
1768 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001769 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1770 PyErr_NoMemory();
1771 return -1;
1772 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001773 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1774 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 PyErr_NoMemory();
1776 return -1;
1777 }
1778 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1779 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001780 _PyUnicode_UTF8(unicode) = NULL;
1781 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001782 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1783 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001784 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 PyObject_FREE(_PyUnicode_WSTR(unicode));
1786 _PyUnicode_WSTR(unicode) = NULL;
1787 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1788#else
1789 assert(num_surrogates == 0);
1790
Victor Stinnerc3c74152011-10-02 20:39:55 +02001791 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001793 _PyUnicode_UTF8(unicode) = NULL;
1794 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1796#endif
1797 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1798 }
1799 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001800 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 return 0;
1802}
1803
Alexander Belopolsky40018472011-02-26 01:02:56 +00001804static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001805unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806{
Walter Dörwald16807132007-05-25 13:52:07 +00001807 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001808 case SSTATE_NOT_INTERNED:
1809 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001810
Benjamin Peterson29060642009-01-31 22:14:21 +00001811 case SSTATE_INTERNED_MORTAL:
1812 /* revive dead object temporarily for DelItem */
1813 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001814 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001815 Py_FatalError(
1816 "deletion of interned string failed");
1817 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001818
Benjamin Peterson29060642009-01-31 22:14:21 +00001819 case SSTATE_INTERNED_IMMORTAL:
1820 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001821 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001822
Benjamin Peterson29060642009-01-31 22:14:21 +00001823 default:
1824 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001825 }
1826
Victor Stinner03490912011-10-03 23:45:12 +02001827 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001829 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001830 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001831 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1832 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001834 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835}
1836
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or the
   cached one-character string stored in unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1853
Alexander Belopolsky40018472011-02-26 01:02:56 +00001854static int
Victor Stinner488fa492011-12-12 00:01:39 +01001855unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001856{
Victor Stinner488fa492011-12-12 00:01:39 +01001857 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001858 if (Py_REFCNT(unicode) != 1)
1859 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001860 if (_PyUnicode_HASH(unicode) != -1)
1861 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 if (PyUnicode_CHECK_INTERNED(unicode))
1863 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001864 if (!PyUnicode_CheckExact(unicode))
1865 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001866#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001867 /* singleton refcount is greater than 1 */
1868 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001869#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001870 return 1;
1871}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001872
Victor Stinnerfe226c02011-10-03 03:52:20 +02001873static int
1874unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1875{
1876 PyObject *unicode;
1877 Py_ssize_t old_length;
1878
1879 assert(p_unicode != NULL);
1880 unicode = *p_unicode;
1881
1882 assert(unicode != NULL);
1883 assert(PyUnicode_Check(unicode));
1884 assert(0 <= length);
1885
Victor Stinner910337b2011-10-03 03:20:16 +02001886 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001887 old_length = PyUnicode_WSTR_LENGTH(unicode);
1888 else
1889 old_length = PyUnicode_GET_LENGTH(unicode);
1890 if (old_length == length)
1891 return 0;
1892
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001893 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001894 _Py_INCREF_UNICODE_EMPTY();
1895 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001896 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001897 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001898 return 0;
1899 }
1900
Victor Stinner488fa492011-12-12 00:01:39 +01001901 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001902 PyObject *copy = resize_copy(unicode, length);
1903 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001904 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001905 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001906 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001907 }
1908
Victor Stinnerfe226c02011-10-03 03:52:20 +02001909 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001910 PyObject *new_unicode = resize_compact(unicode, length);
1911 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001912 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001913 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001914 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001915 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001916 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001917}
1918
Alexander Belopolsky40018472011-02-26 01:02:56 +00001919int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001920PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001921{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001922 PyObject *unicode;
1923 if (p_unicode == NULL) {
1924 PyErr_BadInternalCall();
1925 return -1;
1926 }
1927 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001928 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001929 {
1930 PyErr_BadInternalCall();
1931 return -1;
1932 }
1933 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001934}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001935
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001936/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001937
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001938 WARNING: The function doesn't copy the terminating null character and
1939 doesn't check the maximum character (may write a latin1 character in an
1940 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001941static void
1942unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1943 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001944{
1945 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1946 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001947 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001948
1949 switch (kind) {
1950 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001951 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001952#ifdef Py_DEBUG
1953 if (PyUnicode_IS_ASCII(unicode)) {
1954 Py_UCS4 maxchar = ucs1lib_find_max_char(
1955 (const Py_UCS1*)str,
1956 (const Py_UCS1*)str + len);
1957 assert(maxchar < 128);
1958 }
1959#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001960 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001961 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001962 }
1963 case PyUnicode_2BYTE_KIND: {
1964 Py_UCS2 *start = (Py_UCS2 *)data + index;
1965 Py_UCS2 *ucs2 = start;
1966 assert(index <= PyUnicode_GET_LENGTH(unicode));
1967
Victor Stinner184252a2012-06-16 02:57:41 +02001968 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001969 *ucs2 = (Py_UCS2)*str;
1970
1971 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001972 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001973 }
1974 default: {
1975 Py_UCS4 *start = (Py_UCS4 *)data + index;
1976 Py_UCS4 *ucs4 = start;
1977 assert(kind == PyUnicode_4BYTE_KIND);
1978 assert(index <= PyUnicode_GET_LENGTH(unicode));
1979
Victor Stinner184252a2012-06-16 02:57:41 +02001980 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001981 *ucs4 = (Py_UCS4)*str;
1982
1983 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001984 }
1985 }
1986}
1987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988static PyObject*
1989get_latin1_char(unsigned char ch)
1990{
Victor Stinnera464fc12011-10-02 20:39:30 +02001991 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001993 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 if (!unicode)
1995 return NULL;
1996 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001997 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 unicode_latin1[ch] = unicode;
1999 }
2000 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002001 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002}
2003
Victor Stinner985a82a2014-01-03 12:53:47 +01002004static PyObject*
2005unicode_char(Py_UCS4 ch)
2006{
2007 PyObject *unicode;
2008
2009 assert(ch <= MAX_UNICODE);
2010
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002011 if (ch < 256)
2012 return get_latin1_char(ch);
2013
Victor Stinner985a82a2014-01-03 12:53:47 +01002014 unicode = PyUnicode_New(1, ch);
2015 if (unicode == NULL)
2016 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002017
2018 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2019 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002020 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002021 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002022 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2023 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2024 }
2025 assert(_PyUnicode_CheckConsistency(unicode, 1));
2026 return unicode;
2027}
2028
Alexander Belopolsky40018472011-02-26 01:02:56 +00002029PyObject *
2030PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002032 if (u == NULL)
2033 return (PyObject*)_PyUnicode_New(size);
2034
2035 if (size < 0) {
2036 PyErr_BadInternalCall();
2037 return NULL;
2038 }
2039
2040 return PyUnicode_FromWideChar(u, size);
2041}
2042
2043PyObject *
2044PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2045{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002046 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 Py_UCS4 maxchar = 0;
2048 Py_ssize_t num_surrogates;
2049
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002050 if (u == NULL && size != 0) {
2051 PyErr_BadInternalCall();
2052 return NULL;
2053 }
2054
2055 if (size == -1) {
2056 size = wcslen(u);
2057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002059 /* If the Unicode data is known at construction time, we can apply
2060 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002062 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002063 if (size == 0)
2064 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 /* Single character Unicode objects in the Latin-1 range are
2067 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002068 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002069 return get_latin1_char((unsigned char)*u);
2070
2071 /* If not empty and not single character, copy the Unicode data
2072 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002073 if (find_maxchar_surrogates(u, u + size,
2074 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002075 return NULL;
2076
Victor Stinner8faf8212011-12-08 22:14:11 +01002077 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 if (!unicode)
2079 return NULL;
2080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 switch (PyUnicode_KIND(unicode)) {
2082 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002083 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2085 break;
2086 case PyUnicode_2BYTE_KIND:
2087#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002088 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002090 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2092#endif
2093 break;
2094 case PyUnicode_4BYTE_KIND:
2095#if SIZEOF_WCHAR_T == 2
2096 /* This is the only case which has to process surrogates, thus
2097 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002098 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099#else
2100 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002101 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002102#endif
2103 break;
2104 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002105 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002108 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109}
2110
Alexander Belopolsky40018472011-02-26 01:02:56 +00002111PyObject *
2112PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002113{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002114 if (size < 0) {
2115 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002116 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002117 return NULL;
2118 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002119 if (u != NULL)
2120 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2121 else
2122 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002123}
2124
Alexander Belopolsky40018472011-02-26 01:02:56 +00002125PyObject *
2126PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002127{
2128 size_t size = strlen(u);
2129 if (size > PY_SSIZE_T_MAX) {
2130 PyErr_SetString(PyExc_OverflowError, "input too long");
2131 return NULL;
2132 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002133 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002134}
2135
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002136PyObject *
2137_PyUnicode_FromId(_Py_Identifier *id)
2138{
2139 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002140 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2141 strlen(id->string),
2142 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002143 if (!id->object)
2144 return NULL;
2145 PyUnicode_InternInPlace(&id->object);
2146 assert(!id->next);
2147 id->next = static_strings;
2148 static_strings = id;
2149 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002150 return id->object;
2151}
2152
2153void
2154_PyUnicode_ClearStaticStrings()
2155{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002156 _Py_Identifier *tmp, *s = static_strings;
2157 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002158 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002159 tmp = s->next;
2160 s->next = NULL;
2161 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002162 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002163 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002164}
2165
Benjamin Peterson0df54292012-03-26 14:50:32 -04002166/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002167
Victor Stinnerd3f08822012-05-29 12:57:52 +02002168PyObject*
2169_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002170{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002171 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002172 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002173 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002174#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002175 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002176#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002177 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002178 }
Victor Stinner785938e2011-12-11 20:09:03 +01002179 unicode = PyUnicode_New(size, 127);
2180 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002181 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002182 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2183 assert(_PyUnicode_CheckConsistency(unicode, 1));
2184 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002185}
2186
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002187static Py_UCS4
2188kind_maxchar_limit(unsigned int kind)
2189{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002190 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002191 case PyUnicode_1BYTE_KIND:
2192 return 0x80;
2193 case PyUnicode_2BYTE_KIND:
2194 return 0x100;
2195 case PyUnicode_4BYTE_KIND:
2196 return 0x10000;
2197 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002198 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002199 }
2200}
2201
Victor Stinner702c7342011-10-05 13:50:52 +02002202static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002203_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002204{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002206 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002207
Serhiy Storchaka678db842013-01-26 12:16:36 +02002208 if (size == 0)
2209 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002210 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002211 if (size == 1)
2212 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002213
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002214 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002215 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 if (!res)
2217 return NULL;
2218 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002219 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002221}
2222
Victor Stinnere57b1c02011-09-28 22:20:48 +02002223static PyObject*
2224_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225{
2226 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002227 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002228
Serhiy Storchaka678db842013-01-26 12:16:36 +02002229 if (size == 0)
2230 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002231 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002232 if (size == 1)
2233 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002234
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002235 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002236 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 if (!res)
2238 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002239 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002241 else {
2242 _PyUnicode_CONVERT_BYTES(
2243 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2244 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002245 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 return res;
2247}
2248
Victor Stinnere57b1c02011-09-28 22:20:48 +02002249static PyObject*
2250_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251{
2252 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002253 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002254
Serhiy Storchaka678db842013-01-26 12:16:36 +02002255 if (size == 0)
2256 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002257 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002258 if (size == 1)
2259 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002260
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002261 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002262 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 if (!res)
2264 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002265 if (max_char < 256)
2266 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2267 PyUnicode_1BYTE_DATA(res));
2268 else if (max_char < 0x10000)
2269 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2270 PyUnicode_2BYTE_DATA(res));
2271 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002273 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 return res;
2275}
2276
2277PyObject*
2278PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2279{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002280 if (size < 0) {
2281 PyErr_SetString(PyExc_ValueError, "size must be positive");
2282 return NULL;
2283 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002284 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002285 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002286 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002287 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002288 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002290 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002291 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002292 PyErr_SetString(PyExc_SystemError, "invalid kind");
2293 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002294 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295}
2296
Victor Stinnerece58de2012-04-23 23:36:38 +02002297Py_UCS4
2298_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2299{
2300 enum PyUnicode_Kind kind;
2301 void *startptr, *endptr;
2302
2303 assert(PyUnicode_IS_READY(unicode));
2304 assert(0 <= start);
2305 assert(end <= PyUnicode_GET_LENGTH(unicode));
2306 assert(start <= end);
2307
2308 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2309 return PyUnicode_MAX_CHAR_VALUE(unicode);
2310
2311 if (start == end)
2312 return 127;
2313
Victor Stinner94d558b2012-04-27 22:26:58 +02002314 if (PyUnicode_IS_ASCII(unicode))
2315 return 127;
2316
Victor Stinnerece58de2012-04-23 23:36:38 +02002317 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002318 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002319 endptr = (char *)startptr + end * kind;
2320 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002321 switch(kind) {
2322 case PyUnicode_1BYTE_KIND:
2323 return ucs1lib_find_max_char(startptr, endptr);
2324 case PyUnicode_2BYTE_KIND:
2325 return ucs2lib_find_max_char(startptr, endptr);
2326 case PyUnicode_4BYTE_KIND:
2327 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002328 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002329 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002330 }
2331}
2332
Victor Stinner25a4b292011-10-06 12:31:55 +02002333/* Ensure that a string uses the most efficient storage, if it is not the
2334 case: create a new string with of the right kind. Write NULL into *p_unicode
2335 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002336static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002337unicode_adjust_maxchar(PyObject **p_unicode)
2338{
2339 PyObject *unicode, *copy;
2340 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002341 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002342 unsigned int kind;
2343
2344 assert(p_unicode != NULL);
2345 unicode = *p_unicode;
2346 assert(PyUnicode_IS_READY(unicode));
2347 if (PyUnicode_IS_ASCII(unicode))
2348 return;
2349
2350 len = PyUnicode_GET_LENGTH(unicode);
2351 kind = PyUnicode_KIND(unicode);
2352 if (kind == PyUnicode_1BYTE_KIND) {
2353 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002354 max_char = ucs1lib_find_max_char(u, u + len);
2355 if (max_char >= 128)
2356 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002357 }
2358 else if (kind == PyUnicode_2BYTE_KIND) {
2359 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002360 max_char = ucs2lib_find_max_char(u, u + len);
2361 if (max_char >= 256)
2362 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002363 }
2364 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002365 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002366 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002367 max_char = ucs4lib_find_max_char(u, u + len);
2368 if (max_char >= 0x10000)
2369 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002370 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002371 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002372 if (copy != NULL)
2373 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002374 Py_DECREF(unicode);
2375 *p_unicode = copy;
2376}
2377
Victor Stinner034f6cf2011-09-30 02:26:44 +02002378PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002379_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002380{
Victor Stinner87af4f22011-11-21 23:03:47 +01002381 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002382 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383
Victor Stinner034f6cf2011-09-30 02:26:44 +02002384 if (!PyUnicode_Check(unicode)) {
2385 PyErr_BadInternalCall();
2386 return NULL;
2387 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002388 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002389 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002390
Victor Stinner87af4f22011-11-21 23:03:47 +01002391 length = PyUnicode_GET_LENGTH(unicode);
2392 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002393 if (!copy)
2394 return NULL;
2395 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2396
Christian Heimesf051e432016-09-13 20:22:02 +02002397 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002398 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002399 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002400 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002401}
2402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403
Victor Stinnerbc603d12011-10-02 01:00:40 +02002404/* Widen Unicode objects to larger buffers. Don't write terminating null
2405 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406
2407void*
2408_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2409{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002410 Py_ssize_t len;
2411 void *result;
2412 unsigned int skind;
2413
Benjamin Petersonbac79492012-01-14 13:34:47 -05002414 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002415 return NULL;
2416
2417 len = PyUnicode_GET_LENGTH(s);
2418 skind = PyUnicode_KIND(s);
2419 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002420 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 return NULL;
2422 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002423 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002424 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002425 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002426 if (!result)
2427 return PyErr_NoMemory();
2428 assert(skind == PyUnicode_1BYTE_KIND);
2429 _PyUnicode_CONVERT_BYTES(
2430 Py_UCS1, Py_UCS2,
2431 PyUnicode_1BYTE_DATA(s),
2432 PyUnicode_1BYTE_DATA(s) + len,
2433 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002435 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002436 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002437 if (!result)
2438 return PyErr_NoMemory();
2439 if (skind == PyUnicode_2BYTE_KIND) {
2440 _PyUnicode_CONVERT_BYTES(
2441 Py_UCS2, Py_UCS4,
2442 PyUnicode_2BYTE_DATA(s),
2443 PyUnicode_2BYTE_DATA(s) + len,
2444 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002446 else {
2447 assert(skind == PyUnicode_1BYTE_KIND);
2448 _PyUnicode_CONVERT_BYTES(
2449 Py_UCS1, Py_UCS4,
2450 PyUnicode_1BYTE_DATA(s),
2451 PyUnicode_1BYTE_DATA(s) + len,
2452 result);
2453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002455 default:
2456 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 }
Victor Stinner01698042011-10-04 00:04:26 +02002458 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 return NULL;
2460}
2461
2462static Py_UCS4*
2463as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2464 int copy_null)
2465{
2466 int kind;
2467 void *data;
2468 Py_ssize_t len, targetlen;
2469 if (PyUnicode_READY(string) == -1)
2470 return NULL;
2471 kind = PyUnicode_KIND(string);
2472 data = PyUnicode_DATA(string);
2473 len = PyUnicode_GET_LENGTH(string);
2474 targetlen = len;
2475 if (copy_null)
2476 targetlen++;
2477 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002478 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002479 if (!target) {
2480 PyErr_NoMemory();
2481 return NULL;
2482 }
2483 }
2484 else {
2485 if (targetsize < targetlen) {
2486 PyErr_Format(PyExc_SystemError,
2487 "string is longer than the buffer");
2488 if (copy_null && 0 < targetsize)
2489 target[0] = 0;
2490 return NULL;
2491 }
2492 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002493 if (kind == PyUnicode_1BYTE_KIND) {
2494 Py_UCS1 *start = (Py_UCS1 *) data;
2495 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002497 else if (kind == PyUnicode_2BYTE_KIND) {
2498 Py_UCS2 *start = (Py_UCS2 *) data;
2499 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2500 }
2501 else {
2502 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002503 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002504 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 if (copy_null)
2506 target[len] = 0;
2507 return target;
2508}
2509
2510Py_UCS4*
2511PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2512 int copy_null)
2513{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002514 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 PyErr_BadInternalCall();
2516 return NULL;
2517 }
2518 return as_ucs4(string, target, targetsize, copy_null);
2519}
2520
2521Py_UCS4*
2522PyUnicode_AsUCS4Copy(PyObject *string)
2523{
2524 return as_ucs4(string, NULL, 0, 1);
2525}
2526
/* Maximum number of characters required for the output of %lld or %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002531
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002532static int
2533unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2534 Py_ssize_t width, Py_ssize_t precision)
2535{
2536 Py_ssize_t length, fill, arglen;
2537 Py_UCS4 maxchar;
2538
2539 if (PyUnicode_READY(str) == -1)
2540 return -1;
2541
2542 length = PyUnicode_GET_LENGTH(str);
2543 if ((precision == -1 || precision >= length)
2544 && width <= length)
2545 return _PyUnicodeWriter_WriteStr(writer, str);
2546
2547 if (precision != -1)
2548 length = Py_MIN(precision, length);
2549
2550 arglen = Py_MAX(length, width);
2551 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2552 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2553 else
2554 maxchar = writer->maxchar;
2555
2556 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2557 return -1;
2558
2559 if (width > length) {
2560 fill = width - length;
2561 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2562 return -1;
2563 writer->pos += fill;
2564 }
2565
2566 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2567 str, 0, length);
2568 writer->pos += length;
2569 return 0;
2570}
2571
2572static int
Victor Stinner998b8062018-09-12 00:23:25 +02002573unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002574 Py_ssize_t width, Py_ssize_t precision)
2575{
2576 /* UTF-8 */
2577 Py_ssize_t length;
2578 PyObject *unicode;
2579 int res;
2580
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002581 if (precision == -1) {
2582 length = strlen(str);
2583 }
2584 else {
2585 length = 0;
2586 while (length < precision && str[length]) {
2587 length++;
2588 }
2589 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2591 if (unicode == NULL)
2592 return -1;
2593
2594 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2595 Py_DECREF(unicode);
2596 return res;
2597}
2598
Victor Stinner96865452011-03-01 23:44:09 +00002599static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002600unicode_fromformat_arg(_PyUnicodeWriter *writer,
2601 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002602{
Victor Stinnere215d962012-10-06 23:03:36 +02002603 const char *p;
2604 Py_ssize_t len;
2605 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002606 Py_ssize_t width;
2607 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002608 int longflag;
2609 int longlongflag;
2610 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002611 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002612
2613 p = f;
2614 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002615 zeropad = 0;
2616 if (*f == '0') {
2617 zeropad = 1;
2618 f++;
2619 }
Victor Stinner96865452011-03-01 23:44:09 +00002620
2621 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002622 width = -1;
2623 if (Py_ISDIGIT((unsigned)*f)) {
2624 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002625 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002626 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002627 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002628 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002629 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002630 return NULL;
2631 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002632 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002633 f++;
2634 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002635 }
2636 precision = -1;
2637 if (*f == '.') {
2638 f++;
2639 if (Py_ISDIGIT((unsigned)*f)) {
2640 precision = (*f - '0');
2641 f++;
2642 while (Py_ISDIGIT((unsigned)*f)) {
2643 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2644 PyErr_SetString(PyExc_ValueError,
2645 "precision too big");
2646 return NULL;
2647 }
2648 precision = (precision * 10) + (*f - '0');
2649 f++;
2650 }
2651 }
Victor Stinner96865452011-03-01 23:44:09 +00002652 if (*f == '%') {
2653 /* "%.3%s" => f points to "3" */
2654 f--;
2655 }
2656 }
2657 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002658 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002659 f--;
2660 }
Victor Stinner96865452011-03-01 23:44:09 +00002661
2662 /* Handle %ld, %lu, %lld and %llu. */
2663 longflag = 0;
2664 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002665 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002666 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002667 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002668 longflag = 1;
2669 ++f;
2670 }
Victor Stinner96865452011-03-01 23:44:09 +00002671 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002672 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002673 longlongflag = 1;
2674 f += 2;
2675 }
Victor Stinner96865452011-03-01 23:44:09 +00002676 }
2677 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002678 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002679 size_tflag = 1;
2680 ++f;
2681 }
Victor Stinnere215d962012-10-06 23:03:36 +02002682
2683 if (f[1] == '\0')
2684 writer->overallocate = 0;
2685
2686 switch (*f) {
2687 case 'c':
2688 {
2689 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002690 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002691 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002692 "character argument not in range(0x110000)");
2693 return NULL;
2694 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002695 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002696 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002697 break;
2698 }
2699
2700 case 'i':
2701 case 'd':
2702 case 'u':
2703 case 'x':
2704 {
2705 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002706 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002707 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002708
2709 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002710 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002711 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002712 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002713 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002714 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002715 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002716 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002717 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002718 va_arg(*vargs, size_t));
2719 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002720 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002721 va_arg(*vargs, unsigned int));
2722 }
2723 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002724 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002725 }
2726 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002727 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002728 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002729 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002730 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002731 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002732 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002733 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002734 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002735 va_arg(*vargs, Py_ssize_t));
2736 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002737 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002738 va_arg(*vargs, int));
2739 }
2740 assert(len >= 0);
2741
Victor Stinnere215d962012-10-06 23:03:36 +02002742 if (precision < len)
2743 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002744
2745 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002746 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2747 return NULL;
2748
Victor Stinnere215d962012-10-06 23:03:36 +02002749 if (width > precision) {
2750 Py_UCS4 fillchar;
2751 fill = width - precision;
2752 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002753 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2754 return NULL;
2755 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002756 }
Victor Stinner15a11362012-10-06 23:48:20 +02002757 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002758 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002759 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2760 return NULL;
2761 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002762 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002763
Victor Stinner4a587072013-11-19 12:54:53 +01002764 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2765 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002766 break;
2767 }
2768
2769 case 'p':
2770 {
2771 char number[MAX_LONG_LONG_CHARS];
2772
2773 len = sprintf(number, "%p", va_arg(*vargs, void*));
2774 assert(len >= 0);
2775
2776 /* %p is ill-defined: ensure leading 0x. */
2777 if (number[1] == 'X')
2778 number[1] = 'x';
2779 else if (number[1] != 'x') {
2780 memmove(number + 2, number,
2781 strlen(number) + 1);
2782 number[0] = '0';
2783 number[1] = 'x';
2784 len += 2;
2785 }
2786
Victor Stinner4a587072013-11-19 12:54:53 +01002787 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002788 return NULL;
2789 break;
2790 }
2791
2792 case 's':
2793 {
2794 /* UTF-8 */
2795 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002796 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002797 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002798 break;
2799 }
2800
2801 case 'U':
2802 {
2803 PyObject *obj = va_arg(*vargs, PyObject *);
2804 assert(obj && _PyUnicode_CHECK(obj));
2805
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002806 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002807 return NULL;
2808 break;
2809 }
2810
2811 case 'V':
2812 {
2813 PyObject *obj = va_arg(*vargs, PyObject *);
2814 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002815 if (obj) {
2816 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002817 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002818 return NULL;
2819 }
2820 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002821 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002822 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002823 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002824 }
2825 break;
2826 }
2827
2828 case 'S':
2829 {
2830 PyObject *obj = va_arg(*vargs, PyObject *);
2831 PyObject *str;
2832 assert(obj);
2833 str = PyObject_Str(obj);
2834 if (!str)
2835 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002836 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002837 Py_DECREF(str);
2838 return NULL;
2839 }
2840 Py_DECREF(str);
2841 break;
2842 }
2843
2844 case 'R':
2845 {
2846 PyObject *obj = va_arg(*vargs, PyObject *);
2847 PyObject *repr;
2848 assert(obj);
2849 repr = PyObject_Repr(obj);
2850 if (!repr)
2851 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002852 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002853 Py_DECREF(repr);
2854 return NULL;
2855 }
2856 Py_DECREF(repr);
2857 break;
2858 }
2859
2860 case 'A':
2861 {
2862 PyObject *obj = va_arg(*vargs, PyObject *);
2863 PyObject *ascii;
2864 assert(obj);
2865 ascii = PyObject_ASCII(obj);
2866 if (!ascii)
2867 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002868 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002869 Py_DECREF(ascii);
2870 return NULL;
2871 }
2872 Py_DECREF(ascii);
2873 break;
2874 }
2875
2876 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002877 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002878 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002879 break;
2880
2881 default:
2882 /* if we stumble upon an unknown formatting code, copy the rest
2883 of the format string to the output string. (we cannot just
2884 skip the code, since there's no way to know what's in the
2885 argument list) */
2886 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002887 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002888 return NULL;
2889 f = p+len;
2890 return f;
2891 }
2892
2893 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002894 return f;
2895}
2896
Walter Dörwaldd2034312007-05-18 16:29:38 +00002897PyObject *
2898PyUnicode_FromFormatV(const char *format, va_list vargs)
2899{
Victor Stinnere215d962012-10-06 23:03:36 +02002900 va_list vargs2;
2901 const char *f;
2902 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002903
Victor Stinner8f674cc2013-04-17 23:02:17 +02002904 _PyUnicodeWriter_Init(&writer);
2905 writer.min_length = strlen(format) + 100;
2906 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002907
Benjamin Peterson0c212142016-09-20 20:39:33 -07002908 // Copy varags to be able to pass a reference to a subfunction.
2909 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002910
2911 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002912 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002913 f = unicode_fromformat_arg(&writer, f, &vargs2);
2914 if (f == NULL)
2915 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002916 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002917 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002918 const char *p;
2919 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002920
Victor Stinnere215d962012-10-06 23:03:36 +02002921 p = f;
2922 do
2923 {
2924 if ((unsigned char)*p > 127) {
2925 PyErr_Format(PyExc_ValueError,
2926 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2927 "string, got a non-ASCII byte: 0x%02x",
2928 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002929 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002930 }
2931 p++;
2932 }
2933 while (*p != '\0' && *p != '%');
2934 len = p - f;
2935
2936 if (*p == '\0')
2937 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002938
2939 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002940 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002941
2942 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002943 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002944 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002945 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002946 return _PyUnicodeWriter_Finish(&writer);
2947
2948 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002949 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002950 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002951 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002952}
2953
Walter Dörwaldd2034312007-05-18 16:29:38 +00002954PyObject *
2955PyUnicode_FromFormat(const char *format, ...)
2956{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002957 PyObject* ret;
2958 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002959
2960#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002961 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002962#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002963 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002964#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002965 ret = PyUnicode_FromFormatV(format, vargs);
2966 va_end(vargs);
2967 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002968}
2969
Serhiy Storchakac46db922018-10-23 22:58:24 +03002970static Py_ssize_t
2971unicode_get_widechar_size(PyObject *unicode)
2972{
2973 Py_ssize_t res;
2974
2975 assert(unicode != NULL);
2976 assert(_PyUnicode_CHECK(unicode));
2977
2978 if (_PyUnicode_WSTR(unicode) != NULL) {
2979 return PyUnicode_WSTR_LENGTH(unicode);
2980 }
2981 assert(PyUnicode_IS_READY(unicode));
2982
2983 res = _PyUnicode_LENGTH(unicode);
2984#if SIZEOF_WCHAR_T == 2
2985 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
2986 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2987 const Py_UCS4 *end = s + res;
2988 for (; s < end; ++s) {
2989 if (*s > 0xFFFF) {
2990 ++res;
2991 }
2992 }
2993 }
2994#endif
2995 return res;
2996}
2997
2998static void
2999unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3000{
3001 const wchar_t *wstr;
3002
3003 assert(unicode != NULL);
3004 assert(_PyUnicode_CHECK(unicode));
3005
3006 wstr = _PyUnicode_WSTR(unicode);
3007 if (wstr != NULL) {
3008 memcpy(w, wstr, size * sizeof(wchar_t));
3009 return;
3010 }
3011 assert(PyUnicode_IS_READY(unicode));
3012
3013 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3014 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3015 for (; size--; ++s, ++w) {
3016 *w = *s;
3017 }
3018 }
3019 else {
3020#if SIZEOF_WCHAR_T == 4
3021 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3022 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3023 for (; size--; ++s, ++w) {
3024 *w = *s;
3025 }
3026#else
3027 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3028 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3029 for (; size--; ++s, ++w) {
3030 Py_UCS4 ch = *s;
3031 if (ch > 0xFFFF) {
3032 assert(ch <= MAX_UNICODE);
3033 /* encode surrogate pair in this case */
3034 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3035 if (!size--)
3036 break;
3037 *w = Py_UNICODE_LOW_SURROGATE(ch);
3038 }
3039 else {
3040 *w = ch;
3041 }
3042 }
3043#endif
3044 }
3045}
3046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003047#ifdef HAVE_WCHAR_H
3048
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003049/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003050
Victor Stinnerd88d9832011-09-06 02:00:05 +02003051 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003052 character) required to convert the unicode object. Ignore size argument.
3053
Victor Stinnerd88d9832011-09-06 02:00:05 +02003054 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003055 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003056 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003057Py_ssize_t
3058PyUnicode_AsWideChar(PyObject *unicode,
3059 wchar_t *w,
3060 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003061{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003062 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003063
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003064 if (unicode == NULL) {
3065 PyErr_BadInternalCall();
3066 return -1;
3067 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003068 if (!PyUnicode_Check(unicode)) {
3069 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003070 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003071 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003072
3073 res = unicode_get_widechar_size(unicode);
3074 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003075 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003076 }
3077
3078 if (size > res) {
3079 size = res + 1;
3080 }
3081 else {
3082 res = size;
3083 }
3084 unicode_copy_as_widechar(unicode, w, size);
3085 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003086}
3087
Victor Stinner137c34c2010-09-29 10:25:54 +00003088wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003089PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003090 Py_ssize_t *size)
3091{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003092 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003093 Py_ssize_t buflen;
3094
3095 if (unicode == NULL) {
3096 PyErr_BadInternalCall();
3097 return NULL;
3098 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003099 if (!PyUnicode_Check(unicode)) {
3100 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003101 return NULL;
3102 }
3103
Serhiy Storchakac46db922018-10-23 22:58:24 +03003104 buflen = unicode_get_widechar_size(unicode);
3105 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003106 if (buffer == NULL) {
3107 PyErr_NoMemory();
3108 return NULL;
3109 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003110 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3111 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003112 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003113 }
3114 else if (wcslen(buffer) != (size_t)buflen) {
3115 PyMem_FREE(buffer);
3116 PyErr_SetString(PyExc_ValueError,
3117 "embedded null character");
3118 return NULL;
3119 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003120 return buffer;
3121}
3122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003123#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124
Alexander Belopolsky40018472011-02-26 01:02:56 +00003125PyObject *
3126PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003127{
Victor Stinner8faf8212011-12-08 22:14:11 +01003128 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003129 PyErr_SetString(PyExc_ValueError,
3130 "chr() arg not in range(0x110000)");
3131 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003132 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003133
Victor Stinner985a82a2014-01-03 12:53:47 +01003134 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003135}
3136
Alexander Belopolsky40018472011-02-26 01:02:56 +00003137PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003138PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003140 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003141 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003142 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003143 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003144 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 Py_INCREF(obj);
3146 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003147 }
3148 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003149 /* For a Unicode subtype that's not a Unicode object,
3150 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003151 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003152 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003153 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003154 "Can't convert '%.100s' object to str implicitly",
3155 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003156 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003157}
3158
Alexander Belopolsky40018472011-02-26 01:02:56 +00003159PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003160PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003161 const char *encoding,
3162 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003163{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003164 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003165 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003166
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003168 PyErr_BadInternalCall();
3169 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003171
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003172 /* Decoding bytes objects is the most common case and should be fast */
3173 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003174 if (PyBytes_GET_SIZE(obj) == 0)
3175 _Py_RETURN_UNICODE_EMPTY();
3176 v = PyUnicode_Decode(
3177 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3178 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003179 return v;
3180 }
3181
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003182 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003183 PyErr_SetString(PyExc_TypeError,
3184 "decoding str is not supported");
3185 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003186 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003187
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003188 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3189 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3190 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003191 "decoding to str: need a bytes-like object, %.80s found",
3192 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003193 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003194 }
Tim Petersced69f82003-09-16 20:30:58 +00003195
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003196 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003197 PyBuffer_Release(&buffer);
3198 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003200
Serhiy Storchaka05997252013-01-26 12:14:02 +02003201 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003202 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003203 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204}
3205
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to `lower` in lowercase.  Any
   run of other characters between two copied characters is replaced by a
   single '_'; such runs at the start or end of the name are dropped.  The
   result is always null-terminated on success.

   encoding:  null-terminated input name (must not be NULL).
   lower:     output buffer.
   lower_len: size of `lower` in bytes, including room for the null.

   Return 1 on success, or 0 on error (the normalized name is longer than
   lower_len-1, which includes the case lower_len == 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* A zero-sized buffer cannot even hold the terminating null.  Without
       this guard, &lower[lower_len - 1] would point before the buffer and
       the `l == l_end` bound checks below would never trigger. */
    if (lower_len == 0) {
        return 0;
    }

    l = lower;
    l_end = &lower[lower_len - 1];   /* slot reserved for the final null */
    punct = 0;                       /* pending separator not yet emitted */

    for (e = encoding; *e != 0; e++) {
        char c = *e;

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit the pending separator, unless nothing was written yet. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }
    }
    *l = '\0';
    return 1;
}
3254
Alexander Belopolsky40018472011-02-26 01:02:56 +00003255PyObject *
3256PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003257 Py_ssize_t size,
3258 const char *encoding,
3259 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003260{
3261 PyObject *buffer = NULL, *unicode;
3262 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003263 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3264
3265 if (encoding == NULL) {
3266 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3267 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003268
Fred Drakee4315f52000-05-09 19:53:39 +00003269 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003270 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3271 char *lower = buflower;
3272
3273 /* Fast paths */
3274 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3275 lower += 3;
3276 if (*lower == '_') {
3277 /* Match "utf8" and "utf_8" */
3278 lower++;
3279 }
3280
3281 if (lower[0] == '8' && lower[1] == 0) {
3282 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3283 }
3284 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3285 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3286 }
3287 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3288 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3289 }
3290 }
3291 else {
3292 if (strcmp(lower, "ascii") == 0
3293 || strcmp(lower, "us_ascii") == 0) {
3294 return PyUnicode_DecodeASCII(s, size, errors);
3295 }
Steve Dowercc16be82016-09-08 10:35:16 -07003296 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003297 else if (strcmp(lower, "mbcs") == 0) {
3298 return PyUnicode_DecodeMBCS(s, size, errors);
3299 }
3300 #endif
3301 else if (strcmp(lower, "latin1") == 0
3302 || strcmp(lower, "latin_1") == 0
3303 || strcmp(lower, "iso_8859_1") == 0
3304 || strcmp(lower, "iso8859_1") == 0) {
3305 return PyUnicode_DecodeLatin1(s, size, errors);
3306 }
3307 }
Victor Stinner37296e82010-06-10 13:36:23 +00003308 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309
3310 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003311 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003312 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003313 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003314 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 if (buffer == NULL)
3316 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003317 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 if (unicode == NULL)
3319 goto onError;
3320 if (!PyUnicode_Check(unicode)) {
3321 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003322 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003323 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003324 encoding,
3325 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326 Py_DECREF(unicode);
3327 goto onError;
3328 }
3329 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003330 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003331
Benjamin Peterson29060642009-01-31 22:14:21 +00003332 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333 Py_XDECREF(buffer);
3334 return NULL;
3335}
3336
Alexander Belopolsky40018472011-02-26 01:02:56 +00003337PyObject *
3338PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003339 const char *encoding,
3340 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003341{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003342 if (!PyUnicode_Check(unicode)) {
3343 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003344 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003345 }
3346
Serhiy Storchaka00939072016-10-27 21:05:49 +03003347 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3348 "PyUnicode_AsDecodedObject() is deprecated; "
3349 "use PyCodec_Decode() to decode from str", 1) < 0)
3350 return NULL;
3351
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003352 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003353 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003354
3355 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003356 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003357}
3358
Alexander Belopolsky40018472011-02-26 01:02:56 +00003359PyObject *
3360PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003361 const char *encoding,
3362 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003363{
3364 PyObject *v;
3365
3366 if (!PyUnicode_Check(unicode)) {
3367 PyErr_BadArgument();
3368 goto onError;
3369 }
3370
Serhiy Storchaka00939072016-10-27 21:05:49 +03003371 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3372 "PyUnicode_AsDecodedUnicode() is deprecated; "
3373 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3374 return NULL;
3375
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003376 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003377 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003378
3379 /* Decode via the codec registry */
3380 v = PyCodec_Decode(unicode, encoding, errors);
3381 if (v == NULL)
3382 goto onError;
3383 if (!PyUnicode_Check(v)) {
3384 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003385 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003386 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003387 encoding,
3388 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003389 Py_DECREF(v);
3390 goto onError;
3391 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003392 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003393
Benjamin Peterson29060642009-01-31 22:14:21 +00003394 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003395 return NULL;
3396}
3397
Alexander Belopolsky40018472011-02-26 01:02:56 +00003398PyObject *
3399PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003400 Py_ssize_t size,
3401 const char *encoding,
3402 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403{
3404 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003405
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003406 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003408 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003409 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3410 Py_DECREF(unicode);
3411 return v;
3412}
3413
Alexander Belopolsky40018472011-02-26 01:02:56 +00003414PyObject *
3415PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003416 const char *encoding,
3417 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003418{
3419 PyObject *v;
3420
3421 if (!PyUnicode_Check(unicode)) {
3422 PyErr_BadArgument();
3423 goto onError;
3424 }
3425
Serhiy Storchaka00939072016-10-27 21:05:49 +03003426 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3427 "PyUnicode_AsEncodedObject() is deprecated; "
3428 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3429 "or PyCodec_Encode() for generic encoding", 1) < 0)
3430 return NULL;
3431
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003432 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003433 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003434
3435 /* Encode via the codec registry */
3436 v = PyCodec_Encode(unicode, encoding, errors);
3437 if (v == NULL)
3438 goto onError;
3439 return v;
3440
Benjamin Peterson29060642009-01-31 22:14:21 +00003441 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003442 return NULL;
3443}
3444
Victor Stinner1b579672011-12-17 05:47:23 +01003445
Victor Stinner2cba6b82018-01-10 22:46:15 +01003446static PyObject *
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003447unicode_encode_locale(PyObject *unicode, const char *errors,
3448 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003449{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003450 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003451
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003452 Py_ssize_t wlen;
3453 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3454 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003455 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003456 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003457
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003458 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003459 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003460 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003461 return NULL;
3462 }
3463
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003464 char *str;
3465 size_t error_pos;
3466 const char *reason;
3467 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003468 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003469 PyMem_Free(wstr);
3470
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003471 if (res != 0) {
3472 if (res == -2) {
3473 PyObject *exc;
3474 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3475 "locale", unicode,
3476 (Py_ssize_t)error_pos,
3477 (Py_ssize_t)(error_pos+1),
3478 reason);
3479 if (exc != NULL) {
3480 PyCodec_StrictErrors(exc);
3481 Py_DECREF(exc);
3482 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003483 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003484 else if (res == -3) {
3485 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3486 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003487 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003488 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003489 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003490 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003491 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003492
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003493 PyObject *bytes = PyBytes_FromString(str);
3494 PyMem_RawFree(str);
3495 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003496}
3497
Victor Stinnerad158722010-10-27 00:25:46 +00003498PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003499PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3500{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003501 return unicode_encode_locale(unicode, errors, 1);
3502}
3503
3504PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003505PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003506{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003507 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003508 const _PyCoreConfig *config = &interp->core_config;
3509#if defined(__APPLE__)
3510 return _PyUnicode_AsUTF8String(unicode, config->filesystem_errors);
3511#else
Victor Stinner793b5312011-04-27 00:24:21 +02003512 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3513 cannot use it to encode and decode filenames before it is loaded. Load
3514 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003515 implementation of the locale codec until the codec registry is
3516 initialized and the Python codec is loaded. See initfsencoding(). */
3517 if (interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003518 return PyUnicode_AsEncodedString(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003519 config->filesystem_encoding,
3520 config->filesystem_errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003521 }
3522 else {
Victor Stinner2cba6b82018-01-10 22:46:15 +01003523 return unicode_encode_locale(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003524 config->filesystem_errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003525 }
Victor Stinnerad158722010-10-27 00:25:46 +00003526#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003527}
3528
Alexander Belopolsky40018472011-02-26 01:02:56 +00003529PyObject *
3530PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003531 const char *encoding,
3532 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533{
3534 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003535 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003536
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 if (!PyUnicode_Check(unicode)) {
3538 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 }
Fred Drakee4315f52000-05-09 19:53:39 +00003541
Victor Stinner942889a2016-09-05 15:40:10 -07003542 if (encoding == NULL) {
3543 return _PyUnicode_AsUTF8String(unicode, errors);
3544 }
3545
Fred Drakee4315f52000-05-09 19:53:39 +00003546 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003547 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3548 char *lower = buflower;
3549
3550 /* Fast paths */
3551 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3552 lower += 3;
3553 if (*lower == '_') {
3554 /* Match "utf8" and "utf_8" */
3555 lower++;
3556 }
3557
3558 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003559 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003560 }
3561 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3562 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3563 }
3564 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3565 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3566 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003567 }
Victor Stinner942889a2016-09-05 15:40:10 -07003568 else {
3569 if (strcmp(lower, "ascii") == 0
3570 || strcmp(lower, "us_ascii") == 0) {
3571 return _PyUnicode_AsASCIIString(unicode, errors);
3572 }
Steve Dowercc16be82016-09-08 10:35:16 -07003573#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003574 else if (strcmp(lower, "mbcs") == 0) {
3575 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3576 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003577#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003578 else if (strcmp(lower, "latin1") == 0 ||
3579 strcmp(lower, "latin_1") == 0 ||
3580 strcmp(lower, "iso_8859_1") == 0 ||
3581 strcmp(lower, "iso8859_1") == 0) {
3582 return _PyUnicode_AsLatin1String(unicode, errors);
3583 }
3584 }
Victor Stinner37296e82010-06-10 13:36:23 +00003585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586
3587 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003588 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003590 return NULL;
3591
3592 /* The normal path */
3593 if (PyBytes_Check(v))
3594 return v;
3595
3596 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003597 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003598 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003599 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003600
3601 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003602 "encoder %s returned bytearray instead of bytes; "
3603 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003604 encoding);
3605 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003606 Py_DECREF(v);
3607 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003608 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003609
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003610 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3611 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003612 Py_DECREF(v);
3613 return b;
3614 }
3615
3616 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003617 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003618 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003619 encoding,
3620 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003621 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003622 return NULL;
3623}
3624
Alexander Belopolsky40018472011-02-26 01:02:56 +00003625PyObject *
3626PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003627 const char *encoding,
3628 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003629{
3630 PyObject *v;
3631
3632 if (!PyUnicode_Check(unicode)) {
3633 PyErr_BadArgument();
3634 goto onError;
3635 }
3636
Serhiy Storchaka00939072016-10-27 21:05:49 +03003637 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3638 "PyUnicode_AsEncodedUnicode() is deprecated; "
3639 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3640 return NULL;
3641
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003642 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003643 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003644
3645 /* Encode via the codec registry */
3646 v = PyCodec_Encode(unicode, encoding, errors);
3647 if (v == NULL)
3648 goto onError;
3649 if (!PyUnicode_Check(v)) {
3650 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003651 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003652 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003653 encoding,
3654 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003655 Py_DECREF(v);
3656 goto onError;
3657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003659
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 return NULL;
3662}
3663
Victor Stinner2cba6b82018-01-10 22:46:15 +01003664static PyObject*
3665unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3666 int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003667{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003668 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003669
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003670 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3671 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003672 return NULL;
3673 }
3674
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003675 wchar_t *wstr;
3676 size_t wlen;
3677 const char *reason;
3678 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003679 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003680 if (res != 0) {
3681 if (res == -2) {
3682 PyObject *exc;
3683 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3684 "locale", str, len,
3685 (Py_ssize_t)wlen,
3686 (Py_ssize_t)(wlen + 1),
3687 reason);
3688 if (exc != NULL) {
3689 PyCodec_StrictErrors(exc);
3690 Py_DECREF(exc);
3691 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003692 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003693 else if (res == -3) {
3694 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3695 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003696 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003697 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003698 }
Victor Stinner2f197072011-12-17 07:08:30 +01003699 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003700 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003701
3702 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3703 PyMem_RawFree(wstr);
3704 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003705}
3706
3707PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003708PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3709 const char *errors)
3710{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003711 return unicode_decode_locale(str, len, errors, 1);
3712}
3713
3714PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003715PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003716{
3717 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003718 return unicode_decode_locale(str, size, errors, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003719}
3720
3721
3722PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003723PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003724 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003725 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3726}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003727
Christian Heimes5894ba72007-11-04 11:43:14 +00003728PyObject*
3729PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3730{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003731 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003732 const _PyCoreConfig *config = &interp->core_config;
3733#if defined(__APPLE__)
3734 return PyUnicode_DecodeUTF8Stateful(s, size, config->filesystem_errors, NULL);
3735#else
Victor Stinner793b5312011-04-27 00:24:21 +02003736 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3737 cannot use it to encode and decode filenames before it is loaded. Load
3738 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003739 implementation of the locale codec until the codec registry is
3740 initialized and the Python codec is loaded. See initfsencoding(). */
3741 if (interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003742 return PyUnicode_Decode(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003743 config->filesystem_encoding,
3744 config->filesystem_errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003745 }
3746 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003747 return unicode_decode_locale(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003748 config->filesystem_errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003749 }
Victor Stinnerad158722010-10-27 00:25:46 +00003750#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003751}
3752
Martin v. Löwis011e8422009-05-05 04:43:17 +00003753
3754int
3755PyUnicode_FSConverter(PyObject* arg, void* addr)
3756{
Brett Cannonec6ce872016-09-06 15:50:29 -07003757 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003758 PyObject *output = NULL;
3759 Py_ssize_t size;
3760 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003761 if (arg == NULL) {
3762 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003763 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003764 return 1;
3765 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003766 path = PyOS_FSPath(arg);
3767 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003768 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003769 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003770 if (PyBytes_Check(path)) {
3771 output = path;
3772 }
3773 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3774 output = PyUnicode_EncodeFSDefault(path);
3775 Py_DECREF(path);
3776 if (!output) {
3777 return 0;
3778 }
3779 assert(PyBytes_Check(output));
3780 }
3781
Victor Stinner0ea2a462010-04-30 00:22:08 +00003782 size = PyBytes_GET_SIZE(output);
3783 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003784 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003785 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003786 Py_DECREF(output);
3787 return 0;
3788 }
3789 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003790 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003791}
3792
3793
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003794int
3795PyUnicode_FSDecoder(PyObject* arg, void* addr)
3796{
Brett Cannona5711202016-09-06 19:36:01 -07003797 int is_buffer = 0;
3798 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003799 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003800 if (arg == NULL) {
3801 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003802 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003803 return 1;
3804 }
Brett Cannona5711202016-09-06 19:36:01 -07003805
3806 is_buffer = PyObject_CheckBuffer(arg);
3807 if (!is_buffer) {
3808 path = PyOS_FSPath(arg);
3809 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003810 return 0;
3811 }
Brett Cannona5711202016-09-06 19:36:01 -07003812 }
3813 else {
3814 path = arg;
3815 Py_INCREF(arg);
3816 }
3817
3818 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003819 output = path;
3820 }
3821 else if (PyBytes_Check(path) || is_buffer) {
3822 PyObject *path_bytes = NULL;
3823
3824 if (!PyBytes_Check(path) &&
3825 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003826 "path should be string, bytes, or os.PathLike, not %.200s",
3827 Py_TYPE(arg)->tp_name)) {
3828 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003829 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003830 }
3831 path_bytes = PyBytes_FromObject(path);
3832 Py_DECREF(path);
3833 if (!path_bytes) {
3834 return 0;
3835 }
3836 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3837 PyBytes_GET_SIZE(path_bytes));
3838 Py_DECREF(path_bytes);
3839 if (!output) {
3840 return 0;
3841 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003842 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003843 else {
3844 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003845 "path should be string, bytes, or os.PathLike, not %.200s",
3846 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003847 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003848 return 0;
3849 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003850 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003851 Py_DECREF(output);
3852 return 0;
3853 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003854 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003855 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003856 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003857 Py_DECREF(output);
3858 return 0;
3859 }
3860 *(PyObject**)addr = output;
3861 return Py_CLEANUP_SUPPORTED;
3862}
3863
3864
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003865const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003867{
Christian Heimesf3863112007-11-22 07:46:41 +00003868 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003870 if (!PyUnicode_Check(unicode)) {
3871 PyErr_BadArgument();
3872 return NULL;
3873 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003874 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003875 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003876
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003877 if (PyUnicode_UTF8(unicode) == NULL) {
3878 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003879 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003880 if (bytes == NULL)
3881 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003882 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3883 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003884 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885 Py_DECREF(bytes);
3886 return NULL;
3887 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003888 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003889 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003890 PyBytes_AS_STRING(bytes),
3891 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003892 Py_DECREF(bytes);
3893 }
3894
3895 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003896 *psize = PyUnicode_UTF8_LENGTH(unicode);
3897 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003898}
3899
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003900const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003901PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003902{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003903 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3904}
3905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003906Py_UNICODE *
3907PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3908{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003909 if (!PyUnicode_Check(unicode)) {
3910 PyErr_BadArgument();
3911 return NULL;
3912 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003913 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3914 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003916 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003917 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003918
Serhiy Storchakac46db922018-10-23 22:58:24 +03003919 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3920 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3921 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003924 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3925 if (w == NULL) {
3926 PyErr_NoMemory();
3927 return NULL;
3928 }
3929 unicode_copy_as_widechar(unicode, w, wlen + 1);
3930 _PyUnicode_WSTR(unicode) = w;
3931 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3932 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 }
3934 }
3935 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003936 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003937 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00003938}
3939
Alexander Belopolsky40018472011-02-26 01:02:56 +00003940Py_UNICODE *
3941PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944}
3945
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03003946const Py_UNICODE *
3947_PyUnicode_AsUnicode(PyObject *unicode)
3948{
3949 Py_ssize_t size;
3950 const Py_UNICODE *wstr;
3951
3952 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3953 if (wstr && wcslen(wstr) != (size_t)size) {
3954 PyErr_SetString(PyExc_ValueError, "embedded null character");
3955 return NULL;
3956 }
3957 return wstr;
3958}
3959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960
Alexander Belopolsky40018472011-02-26 01:02:56 +00003961Py_ssize_t
3962PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963{
3964 if (!PyUnicode_Check(unicode)) {
3965 PyErr_BadArgument();
3966 goto onError;
3967 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003968 if (_PyUnicode_WSTR(unicode) == NULL) {
3969 if (PyUnicode_AsUnicode(unicode) == NULL)
3970 goto onError;
3971 }
3972 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973
Benjamin Peterson29060642009-01-31 22:14:21 +00003974 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 return -1;
3976}
3977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978Py_ssize_t
3979PyUnicode_GetLength(PyObject *unicode)
3980{
Victor Stinner07621332012-06-16 04:53:46 +02003981 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003982 PyErr_BadArgument();
3983 return -1;
3984 }
Victor Stinner07621332012-06-16 04:53:46 +02003985 if (PyUnicode_READY(unicode) == -1)
3986 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 return PyUnicode_GET_LENGTH(unicode);
3988}
3989
3990Py_UCS4
3991PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3992{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003993 void *data;
3994 int kind;
3995
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003996 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003997 PyErr_BadArgument();
3998 return (Py_UCS4)-1;
3999 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004000 if (PyUnicode_READY(unicode) == -1) {
4001 return (Py_UCS4)-1;
4002 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004003 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004004 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005 return (Py_UCS4)-1;
4006 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004007 data = PyUnicode_DATA(unicode);
4008 kind = PyUnicode_KIND(unicode);
4009 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010}
4011
4012int
4013PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4014{
4015 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004016 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004017 return -1;
4018 }
Victor Stinner488fa492011-12-12 00:01:39 +01004019 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004020 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004021 PyErr_SetString(PyExc_IndexError, "string index out of range");
4022 return -1;
4023 }
Victor Stinner488fa492011-12-12 00:01:39 +01004024 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004025 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004026 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4027 PyErr_SetString(PyExc_ValueError, "character out of range");
4028 return -1;
4029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4031 index, ch);
4032 return 0;
4033}
4034
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage duration. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}

Victor Stinner554f3f02010-06-16 23:33:54 +00004041/* create or adjust a UnicodeDecodeError */
4042static void
4043make_decode_exception(PyObject **exceptionObject,
4044 const char *encoding,
4045 const char *input, Py_ssize_t length,
4046 Py_ssize_t startpos, Py_ssize_t endpos,
4047 const char *reason)
4048{
4049 if (*exceptionObject == NULL) {
4050 *exceptionObject = PyUnicodeDecodeError_Create(
4051 encoding, input, length, startpos, endpos, reason);
4052 }
4053 else {
4054 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4055 goto onError;
4056 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4057 goto onError;
4058 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4059 goto onError;
4060 }
4061 return;
4062
4063onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004064 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004065}
4066
#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004068static int
4069widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4070{
4071 if (newsize > *size) {
4072 wchar_t *newbuf = *buf;
4073 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4074 PyErr_NoMemory();
4075 return -1;
4076 }
4077 *buf = newbuf;
4078 }
4079 *size = newsize;
4080 return 0;
4081}
4082
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083/* error handling callback helper:
4084 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004085 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 and adjust various state variables.
4087 return 0 on success, -1 on error
4088*/
4089
Alexander Belopolsky40018472011-02-26 01:02:56 +00004090static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004091unicode_decode_call_errorhandler_wchar(
4092 const char *errors, PyObject **errorHandler,
4093 const char *encoding, const char *reason,
4094 const char **input, const char **inend, Py_ssize_t *startinpos,
4095 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004096 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004097{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004098 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004099
4100 PyObject *restuple = NULL;
4101 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004102 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004103 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004104 Py_ssize_t requiredsize;
4105 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004106 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004107 wchar_t *repwstr;
4108 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004109
4110 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 *errorHandler = PyCodec_LookupError(errors);
4112 if (*errorHandler == NULL)
4113 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114 }
4115
Victor Stinner554f3f02010-06-16 23:33:54 +00004116 make_decode_exception(exceptionObject,
4117 encoding,
4118 *input, *inend - *input,
4119 *startinpos, *endinpos,
4120 reason);
4121 if (*exceptionObject == NULL)
4122 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004124 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004126 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004128 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004131 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004133
4134 /* Copy back the bytes variables, which might have been modified by the
4135 callback */
4136 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4137 if (!inputobj)
4138 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004139 *input = PyBytes_AS_STRING(inputobj);
4140 insize = PyBytes_GET_SIZE(inputobj);
4141 *inend = *input + insize;
4142 /* we can DECREF safely, as the exception has another reference,
4143 so the object won't go away. */
4144 Py_DECREF(inputobj);
4145
4146 if (newpos<0)
4147 newpos = insize+newpos;
4148 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004149 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004150 goto onError;
4151 }
4152
4153 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4154 if (repwstr == NULL)
4155 goto onError;
4156 /* need more space? (at least enough for what we
4157 have+the replacement+the rest of the string (starting
4158 at the new input position), so we won't have to check space
4159 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004160 requiredsize = *outpos;
4161 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4162 goto overflow;
4163 requiredsize += repwlen;
4164 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4165 goto overflow;
4166 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004167 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004168 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004169 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004170 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004171 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004172 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004173 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004174 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004175 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004176 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004177 *endinpos = newpos;
4178 *inptr = *input + newpos;
4179
4180 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004181 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004182 return 0;
4183
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004184 overflow:
4185 PyErr_SetString(PyExc_OverflowError,
4186 "decoded result is too long for a Python string");
4187
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004188 onError:
4189 Py_XDECREF(restuple);
4190 return -1;
4191}
Steve Dowercc16be82016-09-08 10:35:16 -07004192#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004193
4194static int
4195unicode_decode_call_errorhandler_writer(
4196 const char *errors, PyObject **errorHandler,
4197 const char *encoding, const char *reason,
4198 const char **input, const char **inend, Py_ssize_t *startinpos,
4199 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4200 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4201{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004202 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004203
4204 PyObject *restuple = NULL;
4205 PyObject *repunicode = NULL;
4206 Py_ssize_t insize;
4207 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004208 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004209 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004210 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004211 int need_to_grow = 0;
4212 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004213
4214 if (*errorHandler == NULL) {
4215 *errorHandler = PyCodec_LookupError(errors);
4216 if (*errorHandler == NULL)
4217 goto onError;
4218 }
4219
4220 make_decode_exception(exceptionObject,
4221 encoding,
4222 *input, *inend - *input,
4223 *startinpos, *endinpos,
4224 reason);
4225 if (*exceptionObject == NULL)
4226 goto onError;
4227
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004228 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004229 if (restuple == NULL)
4230 goto onError;
4231 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004232 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004233 goto onError;
4234 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004235 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004236 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004237
4238 /* Copy back the bytes variables, which might have been modified by the
4239 callback */
4240 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4241 if (!inputobj)
4242 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004243 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004244 *input = PyBytes_AS_STRING(inputobj);
4245 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004246 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004247 /* we can DECREF safely, as the exception has another reference,
4248 so the object won't go away. */
4249 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004250
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004252 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004253 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004254 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004255 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004256 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004257
Victor Stinner170ca6f2013-04-18 00:25:28 +02004258 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004259 if (replen > 1) {
4260 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004261 need_to_grow = 1;
4262 }
4263 new_inptr = *input + newpos;
4264 if (*inend - new_inptr > remain) {
4265 /* We don't know the decoding algorithm here so we make the worst
4266 assumption that one byte decodes to one unicode character.
4267 If unfortunately one byte could decode to more unicode characters,
4268 the decoder may write out-of-bound then. Is it possible for the
4269 algorithms using this function? */
4270 writer->min_length += *inend - new_inptr - remain;
4271 need_to_grow = 1;
4272 }
4273 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004274 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004275 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004276 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4277 goto onError;
4278 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004279 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004280 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004281
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004283 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004284
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004286 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004287 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004288
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004291 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004292}
4293
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
   Note: each macro may evaluate its argument more than once, so the
   argument must be free of side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4328
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Lookup table indexed by ASCII code (0..127); never written to, hence
   const. */
static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times and must have no side effects.  The
 * range test comes first so utf7_category is only indexed with
 * 1..127; NUL is never encoded directly. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374
Alexander Belopolsky40018472011-02-26 01:02:56 +00004375PyObject *
4376PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004377 Py_ssize_t size,
4378 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004379{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004380 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4381}
4382
Antoine Pitrou244651a2009-05-04 18:56:13 +00004383/* The decoder. The only state we preserve is our read position,
4384 * i.e. how many characters we have consumed. So if we end in the
4385 * middle of a shift sequence we have to back off the read position
4386 * and the output to the beginning of the sequence, otherwise we lose
4387 * all the shift state (seen bits, number of bits seen, high
4388 * surrogate). */
4389
Alexander Belopolsky40018472011-02-26 01:02:56 +00004390PyObject *
4391PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004392 Py_ssize_t size,
4393 const char *errors,
4394 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004395{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004397 Py_ssize_t startinpos;
4398 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004399 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004400 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401 const char *errmsg = "";
4402 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004403 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004404 unsigned int base64bits = 0;
4405 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004406 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 PyObject *errorHandler = NULL;
4408 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004409
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004410 if (size == 0) {
4411 if (consumed)
4412 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004413 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004414 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004415
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004416 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004417 _PyUnicodeWriter_Init(&writer);
4418 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004419
4420 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004421 e = s + size;
4422
4423 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004424 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004425 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004426 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004427
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428 if (inShift) { /* in a base-64 section */
4429 if (IS_BASE64(ch)) { /* consume a base-64 character */
4430 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4431 base64bits += 6;
4432 s++;
4433 if (base64bits >= 16) {
4434 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004435 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004436 base64bits -= 16;
4437 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004438 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004439 if (surrogate) {
4440 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004441 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4442 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004443 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004444 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004445 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004446 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 }
4448 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004449 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004450 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 }
4453 }
Victor Stinner551ac952011-11-29 22:58:13 +01004454 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004455 /* first surrogate */
4456 surrogate = outCh;
4457 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004458 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004459 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004460 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461 }
4462 }
4463 }
4464 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004465 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004466 if (base64bits > 0) { /* left-over bits */
4467 if (base64bits >= 6) {
4468 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004469 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470 errmsg = "partial character in shift sequence";
4471 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473 else {
4474 /* Some bits remain; they should be zero */
4475 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004476 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004477 errmsg = "non-zero padding bits in shift sequence";
4478 goto utf7Error;
4479 }
4480 }
4481 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004482 if (surrogate && DECODE_DIRECT(ch)) {
4483 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4484 goto onError;
4485 }
4486 surrogate = 0;
4487 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004488 /* '-' is absorbed; other terminating
4489 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004490 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004491 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492 }
4493 }
4494 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004496 s++; /* consume '+' */
4497 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004499 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004500 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004501 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004502 else if (s < e && !IS_BASE64(*s)) {
4503 s++;
4504 errmsg = "ill-formed sequence";
4505 goto utf7Error;
4506 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004507 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004508 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004509 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004510 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004511 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004512 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004513 }
4514 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004515 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004517 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004518 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004519 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004520 else {
4521 startinpos = s-starts;
4522 s++;
4523 errmsg = "unexpected special character";
4524 goto utf7Error;
4525 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004526 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004527utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004529 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 errors, &errorHandler,
4531 "utf7", errmsg,
4532 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004533 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004534 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 }
4536
Antoine Pitrou244651a2009-05-04 18:56:13 +00004537 /* end of string */
4538
4539 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4540 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004541 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 if (surrogate ||
4543 (base64bits >= 6) ||
4544 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004546 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004547 errors, &errorHandler,
4548 "utf7", "unterminated shift sequence",
4549 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004550 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 goto onError;
4552 if (s < e)
4553 goto restart;
4554 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004556
4557 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004558 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004560 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004561 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004562 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004563 writer.kind, writer.data, shiftOutStart);
4564 Py_XDECREF(errorHandler);
4565 Py_XDECREF(exc);
4566 _PyUnicodeWriter_Dealloc(&writer);
4567 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004568 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004569 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 }
4571 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004572 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004574 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004575
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 Py_XDECREF(errorHandler);
4577 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004578 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579
Benjamin Peterson29060642009-01-31 22:14:21 +00004580 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581 Py_XDECREF(errorHandler);
4582 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004583 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584 return NULL;
4585}
4586
4587
Alexander Belopolsky40018472011-02-26 01:02:56 +00004588PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004589_PyUnicode_EncodeUTF7(PyObject *str,
4590 int base64SetO,
4591 int base64WhiteSpace,
4592 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004593{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004594 int kind;
4595 void *data;
4596 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004597 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004599 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 unsigned int base64bits = 0;
4601 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004602 char * out;
4603 char * start;
4604
Benjamin Petersonbac79492012-01-14 13:34:47 -05004605 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004606 return NULL;
4607 kind = PyUnicode_KIND(str);
4608 data = PyUnicode_DATA(str);
4609 len = PyUnicode_GET_LENGTH(str);
4610
4611 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004612 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004613
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004614 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004615 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004616 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004617 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004618 if (v == NULL)
4619 return NULL;
4620
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004621 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004622 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004623 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004624
Antoine Pitrou244651a2009-05-04 18:56:13 +00004625 if (inShift) {
4626 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4627 /* shifting out */
4628 if (base64bits) { /* output remaining bits */
4629 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4630 base64buffer = 0;
4631 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004632 }
4633 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 /* Characters not in the BASE64 set implicitly unshift the sequence
4635 so no '-' is required, except if the character is itself a '-' */
4636 if (IS_BASE64(ch) || ch == '-') {
4637 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004638 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004639 *out++ = (char) ch;
4640 }
4641 else {
4642 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004643 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004644 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004645 else { /* not in a shift sequence */
4646 if (ch == '+') {
4647 *out++ = '+';
4648 *out++ = '-';
4649 }
4650 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4651 *out++ = (char) ch;
4652 }
4653 else {
4654 *out++ = '+';
4655 inShift = 1;
4656 goto encode_char;
4657 }
4658 }
4659 continue;
4660encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004661 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004662 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004663
Antoine Pitrou244651a2009-05-04 18:56:13 +00004664 /* code first surrogate */
4665 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004666 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667 while (base64bits >= 6) {
4668 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4669 base64bits -= 6;
4670 }
4671 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004672 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004674 base64bits += 16;
4675 base64buffer = (base64buffer << 16) | ch;
4676 while (base64bits >= 6) {
4677 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4678 base64bits -= 6;
4679 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004680 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004681 if (base64bits)
4682 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4683 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004684 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004685 if (_PyBytes_Resize(&v, out - start) < 0)
4686 return NULL;
4687 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004688}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004689PyObject *
4690PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4691 Py_ssize_t size,
4692 int base64SetO,
4693 int base64WhiteSpace,
4694 const char *errors)
4695{
4696 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004697 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004698 if (tmp == NULL)
4699 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004700 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004701 base64WhiteSpace, errors);
4702 Py_DECREF(tmp);
4703 return result;
4704}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004705
Antoine Pitrou244651a2009-05-04 18:56:13 +00004706#undef IS_BASE64
4707#undef FROM_BASE64
4708#undef TO_BASE64
4709#undef DECODE_DIRECT
4710#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004711
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712/* --- UTF-8 Codec -------------------------------------------------------- */
4713
Alexander Belopolsky40018472011-02-26 01:02:56 +00004714PyObject *
4715PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004716 Py_ssize_t size,
4717 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718{
Walter Dörwald69652032004-09-07 20:24:22 +00004719 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4720}
4721
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004722#include "stringlib/asciilib.h"
4723#include "stringlib/codecs.h"
4724#include "stringlib/undef.h"
4725
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004726#include "stringlib/ucs1lib.h"
4727#include "stringlib/codecs.h"
4728#include "stringlib/undef.h"
4729
4730#include "stringlib/ucs2lib.h"
4731#include "stringlib/codecs.h"
4732#include "stringlib/undef.h"
4733
4734#include "stringlib/ucs4lib.h"
4735#include "stringlib/codecs.h"
4736#include "stringlib/undef.h"
4737
Antoine Pitrouab868312009-01-10 15:40:25 +00004738/* Mask to quickly check whether a C 'long' contains a
4739 non-ASCII, UTF8-encoded char. */
4740#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004741# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004742#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004743# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004744#else
4745# error C 'long' size should be either 4 or 8!
4746#endif
4747
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004748static Py_ssize_t
4749ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004750{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004751 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004752 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004753
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004754 /*
4755 * Issue #17237: m68k is a bit different from most architectures in
4756 * that objects do not use "natural alignment" - for example, int and
4757 * long are only aligned at 2-byte boundaries. Therefore the assert()
4758 * won't work; also, tests have shown that skipping the "optimised
4759 * version" will even speed up m68k.
4760 */
4761#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004762#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004763 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4764 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004765 /* Fast path, see in STRINGLIB(utf8_decode) for
4766 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004767 /* Help allocation */
4768 const char *_p = p;
4769 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 while (_p < aligned_end) {
4771 unsigned long value = *(const unsigned long *) _p;
4772 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004773 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004774 *((unsigned long *)q) = value;
4775 _p += SIZEOF_LONG;
4776 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004777 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004778 p = _p;
4779 while (p < end) {
4780 if ((unsigned char)*p & 0x80)
4781 break;
4782 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004784 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004786#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004787#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004788 while (p < end) {
4789 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4790 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004791 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004792 /* Help allocation */
4793 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794 while (_p < aligned_end) {
4795 unsigned long value = *(unsigned long *) _p;
4796 if (value & ASCII_CHAR_MASK)
4797 break;
4798 _p += SIZEOF_LONG;
4799 }
4800 p = _p;
4801 if (_p == end)
4802 break;
4803 }
4804 if ((unsigned char)*p & 0x80)
4805 break;
4806 ++p;
4807 }
4808 memcpy(dest, start, p - start);
4809 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810}
Antoine Pitrouab868312009-01-10 15:40:25 +00004811
Victor Stinner785938e2011-12-11 20:09:03 +01004812PyObject *
4813PyUnicode_DecodeUTF8Stateful(const char *s,
4814 Py_ssize_t size,
4815 const char *errors,
4816 Py_ssize_t *consumed)
4817{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004818 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004819 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004820 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004821
4822 Py_ssize_t startinpos;
4823 Py_ssize_t endinpos;
4824 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004825 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004826 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004827 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004828
4829 if (size == 0) {
4830 if (consumed)
4831 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004832 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004833 }
4834
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004835 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4836 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004837 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004838 *consumed = 1;
4839 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004840 }
4841
Victor Stinner8f674cc2013-04-17 23:02:17 +02004842 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004843 writer.min_length = size;
4844 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004845 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004846
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004847 writer.pos = ascii_decode(s, end, writer.data);
4848 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004849 while (s < end) {
4850 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004851 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004852
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004853 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004854 if (PyUnicode_IS_ASCII(writer.buffer))
4855 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004856 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004857 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004858 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004859 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004860 } else {
4861 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004862 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004863 }
4864
4865 switch (ch) {
4866 case 0:
4867 if (s == end || consumed)
4868 goto End;
4869 errmsg = "unexpected end of data";
4870 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004871 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004872 break;
4873 case 1:
4874 errmsg = "invalid start byte";
4875 startinpos = s - starts;
4876 endinpos = startinpos + 1;
4877 break;
4878 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004879 case 3:
4880 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004881 errmsg = "invalid continuation byte";
4882 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004883 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004884 break;
4885 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004886 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887 goto onError;
4888 continue;
4889 }
4890
Victor Stinner1d65d912015-10-05 13:43:50 +02004891 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004892 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004893
4894 switch (error_handler) {
4895 case _Py_ERROR_IGNORE:
4896 s += (endinpos - startinpos);
4897 break;
4898
4899 case _Py_ERROR_REPLACE:
4900 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4901 goto onError;
4902 s += (endinpos - startinpos);
4903 break;
4904
4905 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004906 {
4907 Py_ssize_t i;
4908
Victor Stinner1d65d912015-10-05 13:43:50 +02004909 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4910 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004911 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004912 ch = (Py_UCS4)(unsigned char)(starts[i]);
4913 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4914 ch + 0xdc00);
4915 writer.pos++;
4916 }
4917 s += (endinpos - startinpos);
4918 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004919 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004920
4921 default:
4922 if (unicode_decode_call_errorhandler_writer(
4923 errors, &error_handler_obj,
4924 "utf-8", errmsg,
4925 &starts, &end, &startinpos, &endinpos, &exc, &s,
4926 &writer))
4927 goto onError;
4928 }
Victor Stinner785938e2011-12-11 20:09:03 +01004929 }
4930
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004931End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004932 if (consumed)
4933 *consumed = s - starts;
4934
Victor Stinner1d65d912015-10-05 13:43:50 +02004935 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004936 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004937 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004938
4939onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004940 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004941 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004942 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004943 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004944}
4945
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004946
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004947/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4948 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004949
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004950 On success, write a pointer to a newly allocated wide character string into
4951 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4952 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004953
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004954 On memory allocation failure, return -1.
4955
4956 On decoding error (if surrogateescape is zero), return -2. If wlen is
4957 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4958 is not NULL, write the decoding error message into *reason. */
4959int
4960_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02004961 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004962{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004963 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004964 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004965 wchar_t *unicode;
4966 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004967
Victor Stinner3d4226a2018-08-29 22:21:32 +02004968 int surrogateescape = 0;
4969 int surrogatepass = 0;
4970 switch (errors)
4971 {
4972 case _Py_ERROR_STRICT:
4973 break;
4974 case _Py_ERROR_SURROGATEESCAPE:
4975 surrogateescape = 1;
4976 break;
4977 case _Py_ERROR_SURROGATEPASS:
4978 surrogatepass = 1;
4979 break;
4980 default:
4981 return -3;
4982 }
4983
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004984 /* Note: size will always be longer than the resulting Unicode
4985 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01004986 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004987 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004988 }
4989
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004990 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01004991 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004992 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004993 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004994
4995 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004996 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004997 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004998 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004999 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005000#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005002#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005004#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 if (ch > 0xFF) {
5006#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005007 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005008#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005009 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005010 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005011 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5012 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5013#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005014 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005015 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005016 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005017 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005018 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005019
5020 if (surrogateescape) {
5021 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5022 }
5023 else {
5024 /* Is it a valid three-byte code? */
5025 if (surrogatepass
5026 && (e - s) >= 3
5027 && (s[0] & 0xf0) == 0xe0
5028 && (s[1] & 0xc0) == 0x80
5029 && (s[2] & 0xc0) == 0x80)
5030 {
5031 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5032 s += 3;
5033 unicode[outpos++] = ch;
5034 }
5035 else {
5036 PyMem_RawFree(unicode );
5037 if (reason != NULL) {
5038 switch (ch) {
5039 case 0:
5040 *reason = "unexpected end of data";
5041 break;
5042 case 1:
5043 *reason = "invalid start byte";
5044 break;
5045 /* 2, 3, 4 */
5046 default:
5047 *reason = "invalid continuation byte";
5048 break;
5049 }
5050 }
5051 if (wlen != NULL) {
5052 *wlen = s - orig_s;
5053 }
5054 return -2;
5055 }
5056 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005057 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005058 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005059 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005060 if (wlen) {
5061 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005062 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005063 *wstr = unicode;
5064 return 0;
5065}
5066
Victor Stinner5f9cf232019-03-19 01:46:25 +01005067
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005068wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005069_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5070 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005071{
5072 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005073 int res = _Py_DecodeUTF8Ex(arg, arglen,
5074 &wstr, wlen,
5075 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005076 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005077 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5078 assert(res != -3);
5079 if (wlen) {
5080 *wlen = (size_t)res;
5081 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005082 return NULL;
5083 }
5084 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005085}
5086
Antoine Pitrouab868312009-01-10 15:40:25 +00005087
Victor Stinnere47e6982017-12-21 15:45:16 +01005088/* UTF-8 encoder using the surrogateescape error handler .
5089
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005090 On success, return 0 and write the newly allocated character string (use
5091 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005092
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005093 On encoding failure, return -2 and write the position of the invalid
5094 surrogate character into *error_pos (if error_pos is set) and the decoding
5095 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005096
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005097 On memory allocation failure, return -1. */
5098int
5099_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005100 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005101{
5102 const Py_ssize_t max_char_size = 4;
5103 Py_ssize_t len = wcslen(text);
5104
5105 assert(len >= 0);
5106
Victor Stinner3d4226a2018-08-29 22:21:32 +02005107 int surrogateescape = 0;
5108 int surrogatepass = 0;
5109 switch (errors)
5110 {
5111 case _Py_ERROR_STRICT:
5112 break;
5113 case _Py_ERROR_SURROGATEESCAPE:
5114 surrogateescape = 1;
5115 break;
5116 case _Py_ERROR_SURROGATEPASS:
5117 surrogatepass = 1;
5118 break;
5119 default:
5120 return -3;
5121 }
5122
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005123 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5124 return -1;
5125 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005126 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005127 if (raw_malloc) {
5128 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005129 }
5130 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005131 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005132 }
5133 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005134 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005135 }
5136
5137 char *p = bytes;
5138 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005139 for (i = 0; i < len; ) {
5140 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005141 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005142 i++;
5143#if Py_UNICODE_SIZE == 2
5144 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5145 && i < len
5146 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5147 {
5148 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5149 i++;
5150 }
5151#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005152
5153 if (ch < 0x80) {
5154 /* Encode ASCII */
5155 *p++ = (char) ch;
5156
5157 }
5158 else if (ch < 0x0800) {
5159 /* Encode Latin-1 */
5160 *p++ = (char)(0xc0 | (ch >> 6));
5161 *p++ = (char)(0x80 | (ch & 0x3f));
5162 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005163 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005164 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005165 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005166 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005167 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005168 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005169 if (reason != NULL) {
5170 *reason = "encoding error";
5171 }
5172 if (raw_malloc) {
5173 PyMem_RawFree(bytes);
5174 }
5175 else {
5176 PyMem_Free(bytes);
5177 }
5178 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005179 }
5180 *p++ = (char)(ch & 0xff);
5181 }
5182 else if (ch < 0x10000) {
5183 *p++ = (char)(0xe0 | (ch >> 12));
5184 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5185 *p++ = (char)(0x80 | (ch & 0x3f));
5186 }
5187 else { /* ch >= 0x10000 */
5188 assert(ch <= MAX_UNICODE);
5189 /* Encode UCS4 Unicode ordinals */
5190 *p++ = (char)(0xf0 | (ch >> 18));
5191 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5192 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5193 *p++ = (char)(0x80 | (ch & 0x3f));
5194 }
5195 }
5196 *p++ = '\0';
5197
5198 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005199 char *bytes2;
5200 if (raw_malloc) {
5201 bytes2 = PyMem_RawRealloc(bytes, final_size);
5202 }
5203 else {
5204 bytes2 = PyMem_Realloc(bytes, final_size);
5205 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005206 if (bytes2 == NULL) {
5207 if (error_pos != NULL) {
5208 *error_pos = (size_t)-1;
5209 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005210 if (raw_malloc) {
5211 PyMem_RawFree(bytes);
5212 }
5213 else {
5214 PyMem_Free(bytes);
5215 }
5216 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005217 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005218 *str = bytes2;
5219 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005220}
5221
5222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005223/* Primary internal function which creates utf8 encoded bytes objects.
5224
5225 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005226 and allocate exactly as much space needed at the end. Else allocate the
5227 maximum possible needed (4 result bytes per Unicode character), and return
5228 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005229*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005230PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005231_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232{
Victor Stinner6099a032011-12-18 14:22:26 +01005233 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005234 void *data;
5235 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005237 if (!PyUnicode_Check(unicode)) {
5238 PyErr_BadArgument();
5239 return NULL;
5240 }
5241
5242 if (PyUnicode_READY(unicode) == -1)
5243 return NULL;
5244
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005245 if (PyUnicode_UTF8(unicode))
5246 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5247 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005248
5249 kind = PyUnicode_KIND(unicode);
5250 data = PyUnicode_DATA(unicode);
5251 size = PyUnicode_GET_LENGTH(unicode);
5252
Benjamin Petersonead6b532011-12-20 17:23:42 -06005253 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005254 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005255 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005256 case PyUnicode_1BYTE_KIND:
5257 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5258 assert(!PyUnicode_IS_ASCII(unicode));
5259 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5260 case PyUnicode_2BYTE_KIND:
5261 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5262 case PyUnicode_4BYTE_KIND:
5263 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005264 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265}
5266
Alexander Belopolsky40018472011-02-26 01:02:56 +00005267PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005268PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5269 Py_ssize_t size,
5270 const char *errors)
5271{
5272 PyObject *v, *unicode;
5273
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005274 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005275 if (unicode == NULL)
5276 return NULL;
5277 v = _PyUnicode_AsUTF8String(unicode, errors);
5278 Py_DECREF(unicode);
5279 return v;
5280}
5281
5282PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005283PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005285 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286}
5287
Walter Dörwald41980ca2007-08-16 21:55:45 +00005288/* --- UTF-32 Codec ------------------------------------------------------- */
5289
5290PyObject *
5291PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005292 Py_ssize_t size,
5293 const char *errors,
5294 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005295{
5296 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5297}
5298
5299PyObject *
5300PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005301 Py_ssize_t size,
5302 const char *errors,
5303 int *byteorder,
5304 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005305{
5306 const char *starts = s;
5307 Py_ssize_t startinpos;
5308 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005309 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005310 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005311 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005312 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005313 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005314 PyObject *errorHandler = NULL;
5315 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005316
Walter Dörwald41980ca2007-08-16 21:55:45 +00005317 q = (unsigned char *)s;
5318 e = q + size;
5319
5320 if (byteorder)
5321 bo = *byteorder;
5322
5323 /* Check for BOM marks (U+FEFF) in the input and adjust current
5324 byte order setting accordingly. In native mode, the leading BOM
5325 mark is skipped, in all other modes, it is copied to the output
5326 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005327 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005328 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005329 if (bom == 0x0000FEFF) {
5330 bo = -1;
5331 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005332 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005333 else if (bom == 0xFFFE0000) {
5334 bo = 1;
5335 q += 4;
5336 }
5337 if (byteorder)
5338 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005339 }
5340
Victor Stinnere64322e2012-10-30 23:12:47 +01005341 if (q == e) {
5342 if (consumed)
5343 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005344 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005345 }
5346
Victor Stinnere64322e2012-10-30 23:12:47 +01005347#ifdef WORDS_BIGENDIAN
5348 le = bo < 0;
5349#else
5350 le = bo <= 0;
5351#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005352 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005353
Victor Stinner8f674cc2013-04-17 23:02:17 +02005354 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005355 writer.min_length = (e - q + 3) / 4;
5356 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005357 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005358
Victor Stinnere64322e2012-10-30 23:12:47 +01005359 while (1) {
5360 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005361 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005362
Victor Stinnere64322e2012-10-30 23:12:47 +01005363 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005364 enum PyUnicode_Kind kind = writer.kind;
5365 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005366 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005367 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005368 if (le) {
5369 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005370 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005371 if (ch > maxch)
5372 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005373 if (kind != PyUnicode_1BYTE_KIND &&
5374 Py_UNICODE_IS_SURROGATE(ch))
5375 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005376 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005377 q += 4;
5378 } while (q <= last);
5379 }
5380 else {
5381 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005382 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005383 if (ch > maxch)
5384 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005385 if (kind != PyUnicode_1BYTE_KIND &&
5386 Py_UNICODE_IS_SURROGATE(ch))
5387 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005388 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005389 q += 4;
5390 } while (q <= last);
5391 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005392 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005393 }
5394
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005395 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005396 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005397 startinpos = ((const char *)q) - starts;
5398 endinpos = startinpos + 4;
5399 }
5400 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005401 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005402 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005403 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005405 startinpos = ((const char *)q) - starts;
5406 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005408 else {
5409 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005410 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005411 goto onError;
5412 q += 4;
5413 continue;
5414 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005415 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005416 startinpos = ((const char *)q) - starts;
5417 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005419
5420 /* The remaining input chars are ignored if the callback
5421 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005422 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005423 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005424 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005426 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005427 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005428 }
5429
Walter Dörwald41980ca2007-08-16 21:55:45 +00005430 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005431 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005432
Walter Dörwald41980ca2007-08-16 21:55:45 +00005433 Py_XDECREF(errorHandler);
5434 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005435 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005436
Benjamin Peterson29060642009-01-31 22:14:21 +00005437 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005438 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005439 Py_XDECREF(errorHandler);
5440 Py_XDECREF(exc);
5441 return NULL;
5442}
5443
5444PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005445_PyUnicode_EncodeUTF32(PyObject *str,
5446 const char *errors,
5447 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005448{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005449 enum PyUnicode_Kind kind;
5450 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005451 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005452 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005453 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005454#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005455 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005456#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005457 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005458#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005459 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005460 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005461 PyObject *errorHandler = NULL;
5462 PyObject *exc = NULL;
5463 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005464
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005465 if (!PyUnicode_Check(str)) {
5466 PyErr_BadArgument();
5467 return NULL;
5468 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005469 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005470 return NULL;
5471 kind = PyUnicode_KIND(str);
5472 data = PyUnicode_DATA(str);
5473 len = PyUnicode_GET_LENGTH(str);
5474
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005475 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005476 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005477 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005478 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005479 if (v == NULL)
5480 return NULL;
5481
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005482 /* output buffer is 4-bytes aligned */
5483 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005484 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005485 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005486 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005487 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005488 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005489
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005490 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005491 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005492 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005493 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005494 else
5495 encoding = "utf-32";
5496
5497 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005498 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5499 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005500 }
5501
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005502 pos = 0;
5503 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005504 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005505
5506 if (kind == PyUnicode_2BYTE_KIND) {
5507 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5508 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005509 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005510 else {
5511 assert(kind == PyUnicode_4BYTE_KIND);
5512 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5513 &out, native_ordering);
5514 }
5515 if (pos == len)
5516 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005517
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005518 rep = unicode_encode_call_errorhandler(
5519 errors, &errorHandler,
5520 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005521 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005522 if (!rep)
5523 goto error;
5524
5525 if (PyBytes_Check(rep)) {
5526 repsize = PyBytes_GET_SIZE(rep);
5527 if (repsize & 3) {
5528 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005529 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005530 "surrogates not allowed");
5531 goto error;
5532 }
5533 moreunits = repsize / 4;
5534 }
5535 else {
5536 assert(PyUnicode_Check(rep));
5537 if (PyUnicode_READY(rep) < 0)
5538 goto error;
5539 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5540 if (!PyUnicode_IS_ASCII(rep)) {
5541 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005542 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005543 "surrogates not allowed");
5544 goto error;
5545 }
5546 }
5547
5548 /* four bytes are reserved for each surrogate */
5549 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005550 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005551 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005552 /* integer overflow */
5553 PyErr_NoMemory();
5554 goto error;
5555 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005556 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005557 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005558 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005559 }
5560
5561 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005562 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005563 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005564 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005565 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005566 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5567 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005568 }
5569
5570 Py_CLEAR(rep);
5571 }
5572
5573 /* Cut back to size actually needed. This is necessary for, for example,
5574 encoding of a string containing isolated surrogates and the 'ignore'
5575 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005576 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005577 if (nsize != PyBytes_GET_SIZE(v))
5578 _PyBytes_Resize(&v, nsize);
5579 Py_XDECREF(errorHandler);
5580 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005581 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005582 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005583 error:
5584 Py_XDECREF(rep);
5585 Py_XDECREF(errorHandler);
5586 Py_XDECREF(exc);
5587 Py_XDECREF(v);
5588 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005589}
5590
Alexander Belopolsky40018472011-02-26 01:02:56 +00005591PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005592PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5593 Py_ssize_t size,
5594 const char *errors,
5595 int byteorder)
5596{
5597 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005598 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005599 if (tmp == NULL)
5600 return NULL;
5601 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5602 Py_DECREF(tmp);
5603 return result;
5604}
5605
5606PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005607PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005608{
Victor Stinnerb960b342011-11-20 19:12:52 +01005609 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005610}
5611
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612/* --- UTF-16 Codec ------------------------------------------------------- */
5613
Tim Peters772747b2001-08-09 22:21:55 +00005614PyObject *
5615PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005616 Py_ssize_t size,
5617 const char *errors,
5618 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619{
Walter Dörwald69652032004-09-07 20:24:22 +00005620 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5621}
5622
5623PyObject *
5624PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005625 Py_ssize_t size,
5626 const char *errors,
5627 int *byteorder,
5628 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005629{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005630 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005631 Py_ssize_t startinpos;
5632 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005633 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005634 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005635 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005636 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005637 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005638 PyObject *errorHandler = NULL;
5639 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005640 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641
Tim Peters772747b2001-08-09 22:21:55 +00005642 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005643 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644
5645 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005646 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005648 /* Check for BOM marks (U+FEFF) in the input and adjust current
5649 byte order setting accordingly. In native mode, the leading BOM
5650 mark is skipped, in all other modes, it is copied to the output
5651 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005652 if (bo == 0 && size >= 2) {
5653 const Py_UCS4 bom = (q[1] << 8) | q[0];
5654 if (bom == 0xFEFF) {
5655 q += 2;
5656 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005658 else if (bom == 0xFFFE) {
5659 q += 2;
5660 bo = 1;
5661 }
5662 if (byteorder)
5663 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005664 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665
Antoine Pitrou63065d72012-05-15 23:48:04 +02005666 if (q == e) {
5667 if (consumed)
5668 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005669 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005670 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005671
Christian Heimes743e0cd2012-10-17 23:52:17 +02005672#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005673 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005674 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005675#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005676 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005677 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005678#endif
Tim Peters772747b2001-08-09 22:21:55 +00005679
Antoine Pitrou63065d72012-05-15 23:48:04 +02005680 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005681 character count normally. Error handler will take care of
5682 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005683 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005684 writer.min_length = (e - q + 1) / 2;
5685 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005686 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005687
Antoine Pitrou63065d72012-05-15 23:48:04 +02005688 while (1) {
5689 Py_UCS4 ch = 0;
5690 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005691 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005692 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005693 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005694 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005695 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005696 native_ordering);
5697 else
5698 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005699 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005700 native_ordering);
5701 } else if (kind == PyUnicode_2BYTE_KIND) {
5702 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005703 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005704 native_ordering);
5705 } else {
5706 assert(kind == PyUnicode_4BYTE_KIND);
5707 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005708 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005709 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005710 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005711 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712
Antoine Pitrou63065d72012-05-15 23:48:04 +02005713 switch (ch)
5714 {
5715 case 0:
5716 /* remaining byte at the end? (size should be even) */
5717 if (q == e || consumed)
5718 goto End;
5719 errmsg = "truncated data";
5720 startinpos = ((const char *)q) - starts;
5721 endinpos = ((const char *)e) - starts;
5722 break;
5723 /* The remaining input chars are ignored if the callback
5724 chooses to skip the input */
5725 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005726 q -= 2;
5727 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005728 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005729 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005730 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005731 endinpos = ((const char *)e) - starts;
5732 break;
5733 case 2:
5734 errmsg = "illegal encoding";
5735 startinpos = ((const char *)q) - 2 - starts;
5736 endinpos = startinpos + 2;
5737 break;
5738 case 3:
5739 errmsg = "illegal UTF-16 surrogate";
5740 startinpos = ((const char *)q) - 4 - starts;
5741 endinpos = startinpos + 2;
5742 break;
5743 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005744 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005745 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 continue;
5747 }
5748
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005749 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005750 errors,
5751 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005752 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005753 &starts,
5754 (const char **)&e,
5755 &startinpos,
5756 &endinpos,
5757 &exc,
5758 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005759 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 }
5762
Antoine Pitrou63065d72012-05-15 23:48:04 +02005763End:
Walter Dörwald69652032004-09-07 20:24:22 +00005764 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005766
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 Py_XDECREF(errorHandler);
5768 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005769 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005772 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005773 Py_XDECREF(errorHandler);
5774 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 return NULL;
5776}
5777
Tim Peters772747b2001-08-09 22:21:55 +00005778PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005779_PyUnicode_EncodeUTF16(PyObject *str,
5780 const char *errors,
5781 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005783 enum PyUnicode_Kind kind;
5784 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005785 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005786 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005787 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005788 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005789#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005790 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005791#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005792 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005793#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005794 const char *encoding;
5795 Py_ssize_t nsize, pos;
5796 PyObject *errorHandler = NULL;
5797 PyObject *exc = NULL;
5798 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005799
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005800 if (!PyUnicode_Check(str)) {
5801 PyErr_BadArgument();
5802 return NULL;
5803 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005804 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005805 return NULL;
5806 kind = PyUnicode_KIND(str);
5807 data = PyUnicode_DATA(str);
5808 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005809
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005810 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005811 if (kind == PyUnicode_4BYTE_KIND) {
5812 const Py_UCS4 *in = (const Py_UCS4 *)data;
5813 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005814 while (in < end) {
5815 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005816 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005817 }
5818 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005819 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005820 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005822 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005823 nsize = len + pairs + (byteorder == 0);
5824 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005825 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005827 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005829 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005830 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005831 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005832 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005833 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005834 }
5835 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005836 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005837 }
Tim Peters772747b2001-08-09 22:21:55 +00005838
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005839 if (kind == PyUnicode_1BYTE_KIND) {
5840 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5841 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005842 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005843
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005844 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005845 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005846 }
5847 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005848 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005849 }
5850 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005851 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005852 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005853
5854 pos = 0;
5855 while (pos < len) {
5856 Py_ssize_t repsize, moreunits;
5857
5858 if (kind == PyUnicode_2BYTE_KIND) {
5859 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5860 &out, native_ordering);
5861 }
5862 else {
5863 assert(kind == PyUnicode_4BYTE_KIND);
5864 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5865 &out, native_ordering);
5866 }
5867 if (pos == len)
5868 break;
5869
5870 rep = unicode_encode_call_errorhandler(
5871 errors, &errorHandler,
5872 encoding, "surrogates not allowed",
5873 str, &exc, pos, pos + 1, &pos);
5874 if (!rep)
5875 goto error;
5876
5877 if (PyBytes_Check(rep)) {
5878 repsize = PyBytes_GET_SIZE(rep);
5879 if (repsize & 1) {
5880 raise_encode_exception(&exc, encoding,
5881 str, pos - 1, pos,
5882 "surrogates not allowed");
5883 goto error;
5884 }
5885 moreunits = repsize / 2;
5886 }
5887 else {
5888 assert(PyUnicode_Check(rep));
5889 if (PyUnicode_READY(rep) < 0)
5890 goto error;
5891 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5892 if (!PyUnicode_IS_ASCII(rep)) {
5893 raise_encode_exception(&exc, encoding,
5894 str, pos - 1, pos,
5895 "surrogates not allowed");
5896 goto error;
5897 }
5898 }
5899
5900 /* two bytes are reserved for each surrogate */
5901 if (moreunits > 1) {
5902 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005903 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005904 /* integer overflow */
5905 PyErr_NoMemory();
5906 goto error;
5907 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005908 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005909 goto error;
5910 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5911 }
5912
5913 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005914 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005915 out += moreunits;
5916 } else /* rep is unicode */ {
5917 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5918 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5919 &out, native_ordering);
5920 }
5921
5922 Py_CLEAR(rep);
5923 }
5924
5925 /* Cut back to size actually needed. This is necessary for, for example,
5926 encoding of a string containing isolated surrogates and the 'ignore' handler
5927 is used. */
5928 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5929 if (nsize != PyBytes_GET_SIZE(v))
5930 _PyBytes_Resize(&v, nsize);
5931 Py_XDECREF(errorHandler);
5932 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005933 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005934 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005935 error:
5936 Py_XDECREF(rep);
5937 Py_XDECREF(errorHandler);
5938 Py_XDECREF(exc);
5939 Py_XDECREF(v);
5940 return NULL;
5941#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942}
5943
Alexander Belopolsky40018472011-02-26 01:02:56 +00005944PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005945PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5946 Py_ssize_t size,
5947 const char *errors,
5948 int byteorder)
5949{
5950 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005951 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005952 if (tmp == NULL)
5953 return NULL;
5954 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5955 Py_DECREF(tmp);
5956 return result;
5957}
5958
5959PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005960PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005962 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963}
5964
5965/* --- Unicode Escape Codec ----------------------------------------------- */
5966
Fredrik Lundh06d12682001-01-24 07:59:11 +00005967static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005968
Alexander Belopolsky40018472011-02-26 01:02:56 +00005969PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005970_PyUnicode_DecodeUnicodeEscape(const char *s,
5971 Py_ssize_t size,
5972 const char *errors,
5973 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005975 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005976 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005978 PyObject *errorHandler = NULL;
5979 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005980
Eric V. Smith42454af2016-10-31 09:22:08 -04005981 // so we can remember if we've seen an invalid escape char or not
5982 *first_invalid_escape = NULL;
5983
Victor Stinner62ec3312016-09-06 17:04:34 -07005984 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005985 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005986 }
5987 /* Escaped strings will always be longer than the resulting
5988 Unicode string, so we start with size here and then reduce the
5989 length after conversion to the true value.
5990 (but if the error callback returns a long replacement string
5991 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005992 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005993 writer.min_length = size;
5994 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5995 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005996 }
5997
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 end = s + size;
5999 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006000 unsigned char c = (unsigned char) *s++;
6001 Py_UCS4 ch;
6002 int count;
6003 Py_ssize_t startinpos;
6004 Py_ssize_t endinpos;
6005 const char *message;
6006
6007#define WRITE_ASCII_CHAR(ch) \
6008 do { \
6009 assert(ch <= 127); \
6010 assert(writer.pos < writer.size); \
6011 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6012 } while(0)
6013
6014#define WRITE_CHAR(ch) \
6015 do { \
6016 if (ch <= writer.maxchar) { \
6017 assert(writer.pos < writer.size); \
6018 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6019 } \
6020 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6021 goto onError; \
6022 } \
6023 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024
6025 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006026 if (c != '\\') {
6027 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 continue;
6029 }
6030
Victor Stinner62ec3312016-09-06 17:04:34 -07006031 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006033 if (s >= end) {
6034 message = "\\ at end of string";
6035 goto error;
6036 }
6037 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006038
Victor Stinner62ec3312016-09-06 17:04:34 -07006039 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006040 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006043 case '\n': continue;
6044 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6045 case '\'': WRITE_ASCII_CHAR('\''); continue;
6046 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6047 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006048 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006049 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6050 case 't': WRITE_ASCII_CHAR('\t'); continue;
6051 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6052 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006053 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006054 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006055 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006056 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 case '0': case '1': case '2': case '3':
6060 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006061 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006062 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006063 ch = (ch<<3) + *s++ - '0';
6064 if (s < end && '0' <= *s && *s <= '7') {
6065 ch = (ch<<3) + *s++ - '0';
6066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006068 WRITE_CHAR(ch);
6069 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 /* hex escapes */
6072 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006074 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006075 message = "truncated \\xXX escape";
6076 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006080 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006081 message = "truncated \\uXXXX escape";
6082 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006085 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006086 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006087 message = "truncated \\UXXXXXXXX escape";
6088 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006089 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006090 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006091 ch <<= 4;
6092 if (c >= '0' && c <= '9') {
6093 ch += c - '0';
6094 }
6095 else if (c >= 'a' && c <= 'f') {
6096 ch += c - ('a' - 10);
6097 }
6098 else if (c >= 'A' && c <= 'F') {
6099 ch += c - ('A' - 10);
6100 }
6101 else {
6102 break;
6103 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006104 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006105 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006106 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006107 }
6108
6109 /* when we get here, ch is a 32-bit unicode character */
6110 if (ch > MAX_UNICODE) {
6111 message = "illegal Unicode character";
6112 goto error;
6113 }
6114
6115 WRITE_CHAR(ch);
6116 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006117
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006119 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006120 if (ucnhash_CAPI == NULL) {
6121 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006122 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6123 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006124 if (ucnhash_CAPI == NULL) {
6125 PyErr_SetString(
6126 PyExc_UnicodeError,
6127 "\\N escapes not supported (can't load unicodedata module)"
6128 );
6129 goto onError;
6130 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006131 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006132
6133 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006134 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006135 const char *start = ++s;
6136 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006137 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006138 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006139 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006140 namelen = s - start;
6141 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006142 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006143 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006144 ch = 0xffffffff; /* in case 'getcode' messes up */
6145 if (namelen <= INT_MAX &&
6146 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6147 &ch, 0)) {
6148 assert(ch <= MAX_UNICODE);
6149 WRITE_CHAR(ch);
6150 continue;
6151 }
6152 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006153 }
6154 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006155 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006156
6157 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006158 if (*first_invalid_escape == NULL) {
6159 *first_invalid_escape = s-1; /* Back up one char, since we've
6160 already incremented s. */
6161 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006162 WRITE_ASCII_CHAR('\\');
6163 WRITE_CHAR(c);
6164 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006166
6167 error:
6168 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006169 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006170 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006171 errors, &errorHandler,
6172 "unicodeescape", message,
6173 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006174 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006175 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006176 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006177 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006178
6179#undef WRITE_ASCII_CHAR
6180#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006182
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006183 Py_XDECREF(errorHandler);
6184 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006185 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006186
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006188 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006189 Py_XDECREF(errorHandler);
6190 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191 return NULL;
6192}
6193
Eric V. Smith42454af2016-10-31 09:22:08 -04006194PyObject *
6195PyUnicode_DecodeUnicodeEscape(const char *s,
6196 Py_ssize_t size,
6197 const char *errors)
6198{
6199 const char *first_invalid_escape;
6200 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6201 &first_invalid_escape);
6202 if (result == NULL)
6203 return NULL;
6204 if (first_invalid_escape != NULL) {
6205 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6206 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006207 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006208 Py_DECREF(result);
6209 return NULL;
6210 }
6211 }
6212 return result;
6213}
6214
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006215/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216
Alexander Belopolsky40018472011-02-26 01:02:56 +00006217PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006218PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006220 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006221 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006223 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006224 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006225 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226
Ezio Melottie7f90372012-10-05 03:33:31 +03006227 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006228 escape.
6229
Ezio Melottie7f90372012-10-05 03:33:31 +03006230 For UCS1 strings it's '\xxx', 4 bytes per source character.
6231 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6232 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006233 */
6234
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006235 if (!PyUnicode_Check(unicode)) {
6236 PyErr_BadArgument();
6237 return NULL;
6238 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006239 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006240 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006241 }
Victor Stinner358af132015-10-12 22:36:57 +02006242
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006243 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006244 if (len == 0) {
6245 return PyBytes_FromStringAndSize(NULL, 0);
6246 }
6247
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006248 kind = PyUnicode_KIND(unicode);
6249 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006250 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6251 bytes, and 1 byte characters 4. */
6252 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006253 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006254 return PyErr_NoMemory();
6255 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006256 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006257 if (repr == NULL) {
6258 return NULL;
6259 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006260
Victor Stinner62ec3312016-09-06 17:04:34 -07006261 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006262 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006263 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006264
Victor Stinner62ec3312016-09-06 17:04:34 -07006265 /* U+0000-U+00ff range */
6266 if (ch < 0x100) {
6267 if (ch >= ' ' && ch < 127) {
6268 if (ch != '\\') {
6269 /* Copy printable US ASCII as-is */
6270 *p++ = (char) ch;
6271 }
6272 /* Escape backslashes */
6273 else {
6274 *p++ = '\\';
6275 *p++ = '\\';
6276 }
6277 }
Victor Stinner358af132015-10-12 22:36:57 +02006278
Victor Stinner62ec3312016-09-06 17:04:34 -07006279 /* Map special whitespace to '\t', \n', '\r' */
6280 else if (ch == '\t') {
6281 *p++ = '\\';
6282 *p++ = 't';
6283 }
6284 else if (ch == '\n') {
6285 *p++ = '\\';
6286 *p++ = 'n';
6287 }
6288 else if (ch == '\r') {
6289 *p++ = '\\';
6290 *p++ = 'r';
6291 }
6292
6293 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6294 else {
6295 *p++ = '\\';
6296 *p++ = 'x';
6297 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6298 *p++ = Py_hexdigits[ch & 0x000F];
6299 }
Tim Petersced69f82003-09-16 20:30:58 +00006300 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006301 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006302 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 *p++ = '\\';
6304 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006305 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6306 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6307 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6308 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006310 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6311 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006312
Victor Stinner62ec3312016-09-06 17:04:34 -07006313 /* Make sure that the first two digits are zero */
6314 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006315 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006316 *p++ = 'U';
6317 *p++ = '0';
6318 *p++ = '0';
6319 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6320 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6321 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6322 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6323 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6324 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327
Victor Stinner62ec3312016-09-06 17:04:34 -07006328 assert(p - PyBytes_AS_STRING(repr) > 0);
6329 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6330 return NULL;
6331 }
6332 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333}
6334
Alexander Belopolsky40018472011-02-26 01:02:56 +00006335PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006336PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6337 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006339 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006340 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006341 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006343 }
6344
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006345 result = PyUnicode_AsUnicodeEscapeString(tmp);
6346 Py_DECREF(tmp);
6347 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348}
6349
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6351
Alexander Belopolsky40018472011-02-26 01:02:56 +00006352PyObject *
6353PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006354 Py_ssize_t size,
6355 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006357 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006358 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360 PyObject *errorHandler = NULL;
6361 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006362
Victor Stinner62ec3312016-09-06 17:04:34 -07006363 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006364 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006365 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006366
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367 /* Escaped strings will always be longer than the resulting
6368 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006369 length after conversion to the true value. (But decoding error
6370 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006371 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006372 writer.min_length = size;
6373 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6374 goto onError;
6375 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006376
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377 end = s + size;
6378 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006379 unsigned char c = (unsigned char) *s++;
6380 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006381 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006382 Py_ssize_t startinpos;
6383 Py_ssize_t endinpos;
6384 const char *message;
6385
6386#define WRITE_CHAR(ch) \
6387 do { \
6388 if (ch <= writer.maxchar) { \
6389 assert(writer.pos < writer.size); \
6390 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6391 } \
6392 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6393 goto onError; \
6394 } \
6395 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006398 if (c != '\\' || s >= end) {
6399 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006400 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006401 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006402
Victor Stinner62ec3312016-09-06 17:04:34 -07006403 c = (unsigned char) *s++;
6404 if (c == 'u') {
6405 count = 4;
6406 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006407 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006408 else if (c == 'U') {
6409 count = 8;
6410 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006411 }
6412 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006413 assert(writer.pos < writer.size);
6414 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6415 WRITE_CHAR(c);
6416 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006417 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006418 startinpos = s - starts - 2;
6419
6420 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6421 for (ch = 0; count && s < end; ++s, --count) {
6422 c = (unsigned char)*s;
6423 ch <<= 4;
6424 if (c >= '0' && c <= '9') {
6425 ch += c - '0';
6426 }
6427 else if (c >= 'a' && c <= 'f') {
6428 ch += c - ('a' - 10);
6429 }
6430 else if (c >= 'A' && c <= 'F') {
6431 ch += c - ('A' - 10);
6432 }
6433 else {
6434 break;
6435 }
6436 }
6437 if (!count) {
6438 if (ch <= MAX_UNICODE) {
6439 WRITE_CHAR(ch);
6440 continue;
6441 }
6442 message = "\\Uxxxxxxxx out of range";
6443 }
6444
6445 endinpos = s-starts;
6446 writer.min_length = end - s + writer.pos;
6447 if (unicode_decode_call_errorhandler_writer(
6448 errors, &errorHandler,
6449 "rawunicodeescape", message,
6450 &starts, &end, &startinpos, &endinpos, &exc, &s,
6451 &writer)) {
6452 goto onError;
6453 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006454 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006455
6456#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006458 Py_XDECREF(errorHandler);
6459 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006460 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006461
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006463 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006464 Py_XDECREF(errorHandler);
6465 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006467
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468}
6469
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006470
Alexander Belopolsky40018472011-02-26 01:02:56 +00006471PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006472PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473{
Victor Stinner62ec3312016-09-06 17:04:34 -07006474 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006476 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006477 int kind;
6478 void *data;
6479 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006481 if (!PyUnicode_Check(unicode)) {
6482 PyErr_BadArgument();
6483 return NULL;
6484 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006485 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006486 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006487 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006488 kind = PyUnicode_KIND(unicode);
6489 data = PyUnicode_DATA(unicode);
6490 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006491 if (kind == PyUnicode_1BYTE_KIND) {
6492 return PyBytes_FromStringAndSize(data, len);
6493 }
Victor Stinner0e368262011-11-10 20:12:49 +01006494
Victor Stinner62ec3312016-09-06 17:04:34 -07006495 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6496 bytes, and 1 byte characters 4. */
6497 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006498
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 if (len > PY_SSIZE_T_MAX / expandsize) {
6500 return PyErr_NoMemory();
6501 }
6502 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6503 if (repr == NULL) {
6504 return NULL;
6505 }
6506 if (len == 0) {
6507 return repr;
6508 }
6509
6510 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006511 for (pos = 0; pos < len; pos++) {
6512 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006513
Victor Stinner62ec3312016-09-06 17:04:34 -07006514 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6515 if (ch < 0x100) {
6516 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006517 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006518 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006519 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520 *p++ = '\\';
6521 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006522 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6523 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6524 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6525 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006527 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6528 else {
6529 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6530 *p++ = '\\';
6531 *p++ = 'U';
6532 *p++ = '0';
6533 *p++ = '0';
6534 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6535 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6536 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6537 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6538 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6539 *p++ = Py_hexdigits[ch & 15];
6540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006542
Victor Stinner62ec3312016-09-06 17:04:34 -07006543 assert(p > PyBytes_AS_STRING(repr));
6544 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6545 return NULL;
6546 }
6547 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548}
6549
Alexander Belopolsky40018472011-02-26 01:02:56 +00006550PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006551PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6552 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006554 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006555 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006556 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006557 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006558 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6559 Py_DECREF(tmp);
6560 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561}
6562
/* --- Latin-1 Codec ------------------------------------------------------ */
6564
Alexander Belopolsky40018472011-02-26 01:02:56 +00006565PyObject *
6566PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006567 Py_ssize_t size,
6568 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006571 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572}
6573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006574/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006575static void
6576make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006577 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006578 PyObject *unicode,
6579 Py_ssize_t startpos, Py_ssize_t endpos,
6580 const char *reason)
6581{
6582 if (*exceptionObject == NULL) {
6583 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006584 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006585 encoding, unicode, startpos, endpos, reason);
6586 }
6587 else {
6588 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6589 goto onError;
6590 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6591 goto onError;
6592 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6593 goto onError;
6594 return;
6595 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006596 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006597 }
6598}
6599
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006600/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006601static void
6602raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006603 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006604 PyObject *unicode,
6605 Py_ssize_t startpos, Py_ssize_t endpos,
6606 const char *reason)
6607{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006608 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006609 encoding, unicode, startpos, endpos, reason);
6610 if (*exceptionObject != NULL)
6611 PyCodec_StrictErrors(*exceptionObject);
6612}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006613
6614/* error handling callback helper:
6615 build arguments, call the callback and check the arguments,
6616 put the result into newpos and return the replacement string, which
6617 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006618static PyObject *
6619unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006620 PyObject **errorHandler,
6621 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006622 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006623 Py_ssize_t startpos, Py_ssize_t endpos,
6624 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006625{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006626 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006627 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006628 PyObject *restuple;
6629 PyObject *resunicode;
6630
6631 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006632 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006633 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006635 }
6636
Benjamin Petersonbac79492012-01-14 13:34:47 -05006637 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006638 return NULL;
6639 len = PyUnicode_GET_LENGTH(unicode);
6640
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006641 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006642 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006643 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006644 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006645
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006646 restuple = PyObject_CallFunctionObjArgs(
6647 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006648 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006650 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006651 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006652 Py_DECREF(restuple);
6653 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006654 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006655 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 &resunicode, newpos)) {
6657 Py_DECREF(restuple);
6658 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006659 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006660 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6661 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6662 Py_DECREF(restuple);
6663 return NULL;
6664 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006666 *newpos = len + *newpos;
6667 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006668 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 Py_DECREF(restuple);
6670 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006671 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006672 Py_INCREF(resunicode);
6673 Py_DECREF(restuple);
6674 return resunicode;
6675}
6676
Alexander Belopolsky40018472011-02-26 01:02:56 +00006677static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006678unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006679 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006680 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006681{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006682 /* input state */
6683 Py_ssize_t pos=0, size;
6684 int kind;
6685 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006686 /* pointer into the output */
6687 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006688 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6689 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006690 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006691 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006692 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006693 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006694 /* output object */
6695 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006696
Benjamin Petersonbac79492012-01-14 13:34:47 -05006697 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006698 return NULL;
6699 size = PyUnicode_GET_LENGTH(unicode);
6700 kind = PyUnicode_KIND(unicode);
6701 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006702 /* allocate enough for a simple encoding without
6703 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006704 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006705 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006706
6707 _PyBytesWriter_Init(&writer);
6708 str = _PyBytesWriter_Alloc(&writer, size);
6709 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006710 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006711
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006712 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006713 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714
Benjamin Peterson29060642009-01-31 22:14:21 +00006715 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006716 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006717 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006718 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006719 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006720 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006722 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006723 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006724 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006725 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006727
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006728 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006729 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006730
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006731 /* Only overallocate the buffer if it's not the last write */
6732 writer.overallocate = (collend < size);
6733
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006735 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006736 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006737
6738 switch (error_handler) {
6739 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006740 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006742
6743 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006744 memset(str, '?', collend - collstart);
6745 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006746 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006747 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006748 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006749 break;
Victor Stinner50149202015-09-22 00:26:54 +02006750
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006751 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006752 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006753 writer.min_size -= (collend - collstart);
6754 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006755 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006756 if (str == NULL)
6757 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006758 pos = collend;
6759 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006760
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006761 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006762 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006763 writer.min_size -= (collend - collstart);
6764 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006765 unicode, collstart, collend);
6766 if (str == NULL)
6767 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006768 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 break;
Victor Stinner50149202015-09-22 00:26:54 +02006770
Victor Stinnerc3713e92015-09-29 12:32:13 +02006771 case _Py_ERROR_SURROGATEESCAPE:
6772 for (i = collstart; i < collend; ++i) {
6773 ch = PyUnicode_READ(kind, data, i);
6774 if (ch < 0xdc80 || 0xdcff < ch) {
6775 /* Not a UTF-8b surrogate */
6776 break;
6777 }
6778 *str++ = (char)(ch - 0xdc00);
6779 ++pos;
6780 }
6781 if (i >= collend)
6782 break;
6783 collstart = pos;
6784 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006785 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006786
Benjamin Peterson29060642009-01-31 22:14:21 +00006787 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006788 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6789 encoding, reason, unicode, &exc,
6790 collstart, collend, &newpos);
6791 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006793
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006794 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006795 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006796
Victor Stinner6bd525b2015-10-09 13:10:05 +02006797 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006798 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006799 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006800 PyBytes_AS_STRING(rep),
6801 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006802 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006803 else {
6804 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006805
Victor Stinner6bd525b2015-10-09 13:10:05 +02006806 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006808
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006809 if (limit == 256 ?
6810 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6811 !PyUnicode_IS_ASCII(rep))
6812 {
6813 /* Not all characters are smaller than limit */
6814 raise_encode_exception(&exc, encoding, unicode,
6815 collstart, collend, reason);
6816 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006817 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006818 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6819 str = _PyBytesWriter_WriteBytes(&writer, str,
6820 PyUnicode_DATA(rep),
6821 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006823 if (str == NULL)
6824 goto onError;
6825
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006826 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006827 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006828 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006829
6830 /* If overallocation was disabled, ensure that it was the last
6831 write. Otherwise, we missed an optimization */
6832 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006833 }
6834 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006835
Victor Stinner50149202015-09-22 00:26:54 +02006836 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006837 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006838 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006839
6840 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006841 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006842 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006843 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006844 Py_XDECREF(exc);
6845 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006846}
6847
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006848/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006849PyObject *
6850PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006851 Py_ssize_t size,
6852 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006854 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006855 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006856 if (unicode == NULL)
6857 return NULL;
6858 result = unicode_encode_ucs1(unicode, errors, 256);
6859 Py_DECREF(unicode);
6860 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861}
6862
Alexander Belopolsky40018472011-02-26 01:02:56 +00006863PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006864_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865{
6866 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006867 PyErr_BadArgument();
6868 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006870 if (PyUnicode_READY(unicode) == -1)
6871 return NULL;
6872 /* Fast path: if it is a one-byte string, construct
6873 bytes object directly. */
6874 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6875 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6876 PyUnicode_GET_LENGTH(unicode));
6877 /* Non-Latin-1 characters present. Defer to above function to
6878 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006879 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006880}
6881
6882PyObject*
6883PyUnicode_AsLatin1String(PyObject *unicode)
6884{
6885 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886}
6887
6888/* --- 7-bit ASCII Codec -------------------------------------------------- */
6889
Alexander Belopolsky40018472011-02-26 01:02:56 +00006890PyObject *
6891PyUnicode_DecodeASCII(const char *s,
6892 Py_ssize_t size,
6893 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006895 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006896 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006897 int kind;
6898 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006899 Py_ssize_t startinpos;
6900 Py_ssize_t endinpos;
6901 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006902 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006903 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006904 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006905 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006906
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006908 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006909
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006911 if (size == 1 && (unsigned char)s[0] < 128)
6912 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006913
Victor Stinner8f674cc2013-04-17 23:02:17 +02006914 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006915 writer.min_length = size;
6916 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006917 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006918
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006919 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006920 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006921 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006922 writer.pos = outpos;
6923 if (writer.pos == size)
6924 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006925
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006926 s += writer.pos;
6927 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006928 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006929 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006930 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006931 PyUnicode_WRITE(kind, data, writer.pos, c);
6932 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006934 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006935 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006936
6937 /* byte outsize range 0x00..0x7f: call the error handler */
6938
6939 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006940 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006941
6942 switch (error_handler)
6943 {
6944 case _Py_ERROR_REPLACE:
6945 case _Py_ERROR_SURROGATEESCAPE:
6946 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006947 but we may switch to UCS2 at the first write */
6948 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6949 goto onError;
6950 kind = writer.kind;
6951 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006952
6953 if (error_handler == _Py_ERROR_REPLACE)
6954 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6955 else
6956 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6957 writer.pos++;
6958 ++s;
6959 break;
6960
6961 case _Py_ERROR_IGNORE:
6962 ++s;
6963 break;
6964
6965 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006966 startinpos = s-starts;
6967 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006968 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006969 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 "ascii", "ordinal not in range(128)",
6971 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006972 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006974 kind = writer.kind;
6975 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006976 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006978 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006979 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006980 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006981
Benjamin Peterson29060642009-01-31 22:14:21 +00006982 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006983 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006984 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006985 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986 return NULL;
6987}
6988
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006989/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006990PyObject *
6991PyUnicode_EncodeASCII(const Py_UNICODE *p,
6992 Py_ssize_t size,
6993 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006995 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006996 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006997 if (unicode == NULL)
6998 return NULL;
6999 result = unicode_encode_ucs1(unicode, errors, 128);
7000 Py_DECREF(unicode);
7001 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002}
7003
Alexander Belopolsky40018472011-02-26 01:02:56 +00007004PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007005_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006{
7007 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 PyErr_BadArgument();
7009 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007011 if (PyUnicode_READY(unicode) == -1)
7012 return NULL;
7013 /* Fast path: if it is an ASCII-only string, construct bytes object
7014 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007015 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007016 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7017 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007018 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007019}
7020
7021PyObject *
7022PyUnicode_AsASCIIString(PyObject *unicode)
7023{
7024 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025}
7026
Steve Dowercc16be82016-09-08 10:35:16 -07007027#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007028
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007029/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007030
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007031#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007032#define NEED_RETRY
7033#endif
7034
Victor Stinner3a50e702011-10-18 21:21:00 +02007035#ifndef WC_ERR_INVALID_CHARS
7036# define WC_ERR_INVALID_CHARS 0x0080
7037#endif
7038
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007039static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007040code_page_name(UINT code_page, PyObject **obj)
7041{
7042 *obj = NULL;
7043 if (code_page == CP_ACP)
7044 return "mbcs";
7045 if (code_page == CP_UTF7)
7046 return "CP_UTF7";
7047 if (code_page == CP_UTF8)
7048 return "CP_UTF8";
7049
7050 *obj = PyBytes_FromFormat("cp%u", code_page);
7051 if (*obj == NULL)
7052 return NULL;
7053 return PyBytes_AS_STRING(*obj);
7054}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007055
Victor Stinner3a50e702011-10-18 21:21:00 +02007056static DWORD
7057decode_code_page_flags(UINT code_page)
7058{
7059 if (code_page == CP_UTF7) {
7060 /* The CP_UTF7 decoder only supports flags=0 */
7061 return 0;
7062 }
7063 else
7064 return MB_ERR_INVALID_CHARS;
7065}
7066
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007067/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007068 * Decode a byte string from a Windows code page into unicode object in strict
7069 * mode.
7070 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007071 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7072 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007073 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007074static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007075decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007076 wchar_t **buf,
7077 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007078 const char *in,
7079 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007080{
Victor Stinner3a50e702011-10-18 21:21:00 +02007081 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007082 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007083 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007084
7085 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007086 assert(insize > 0);
7087 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7088 if (outsize <= 0)
7089 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007090
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007091 /* Extend a wchar_t* buffer */
7092 Py_ssize_t n = *bufsize; /* Get the current length */
7093 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7094 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007095 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007096 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007097
7098 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007099 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7100 if (outsize <= 0)
7101 goto error;
7102 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007103
Victor Stinner3a50e702011-10-18 21:21:00 +02007104error:
7105 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7106 return -2;
7107 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007108 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007109}
7110
Victor Stinner3a50e702011-10-18 21:21:00 +02007111/*
7112 * Decode a byte string from a code page into unicode object with an error
7113 * handler.
7114 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007115 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007116 * UnicodeDecodeError exception and returns -1 on error.
7117 */
7118static int
7119decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007120 wchar_t **buf,
7121 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007122 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007123 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007124{
7125 const char *startin = in;
7126 const char *endin = in + size;
7127 const DWORD flags = decode_code_page_flags(code_page);
7128 /* Ideally, we should get reason from FormatMessage. This is the Windows
7129 2000 English version of the message. */
7130 const char *reason = "No mapping for the Unicode character exists "
7131 "in the target code page.";
7132 /* each step cannot decode more than 1 character, but a character can be
7133 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007134 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007135 int insize;
7136 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007137 PyObject *errorHandler = NULL;
7138 PyObject *exc = NULL;
7139 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007140 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007141 DWORD err;
7142 int ret = -1;
7143
7144 assert(size > 0);
7145
7146 encoding = code_page_name(code_page, &encoding_obj);
7147 if (encoding == NULL)
7148 return -1;
7149
Victor Stinner7d00cc12014-03-17 23:08:06 +01007150 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007151 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7152 UnicodeDecodeError. */
7153 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7154 if (exc != NULL) {
7155 PyCodec_StrictErrors(exc);
7156 Py_CLEAR(exc);
7157 }
7158 goto error;
7159 }
7160
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007161 /* Extend a wchar_t* buffer */
7162 Py_ssize_t n = *bufsize; /* Get the current length */
7163 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7164 PyErr_NoMemory();
7165 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007167 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7168 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007169 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007170 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007171
7172 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007173 while (in < endin)
7174 {
7175 /* Decode a character */
7176 insize = 1;
7177 do
7178 {
7179 outsize = MultiByteToWideChar(code_page, flags,
7180 in, insize,
7181 buffer, Py_ARRAY_LENGTH(buffer));
7182 if (outsize > 0)
7183 break;
7184 err = GetLastError();
7185 if (err != ERROR_NO_UNICODE_TRANSLATION
7186 && err != ERROR_INSUFFICIENT_BUFFER)
7187 {
7188 PyErr_SetFromWindowsErr(0);
7189 goto error;
7190 }
7191 insize++;
7192 }
7193 /* 4=maximum length of a UTF-8 sequence */
7194 while (insize <= 4 && (in + insize) <= endin);
7195
7196 if (outsize <= 0) {
7197 Py_ssize_t startinpos, endinpos, outpos;
7198
Victor Stinner7d00cc12014-03-17 23:08:06 +01007199 /* last character in partial decode? */
7200 if (in + insize >= endin && !final)
7201 break;
7202
Victor Stinner3a50e702011-10-18 21:21:00 +02007203 startinpos = in - startin;
7204 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007205 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007206 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007207 errors, &errorHandler,
7208 encoding, reason,
7209 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007210 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 {
7212 goto error;
7213 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007214 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007215 }
7216 else {
7217 in += insize;
7218 memcpy(out, buffer, outsize * sizeof(wchar_t));
7219 out += outsize;
7220 }
7221 }
7222
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007223 /* Shrink the buffer */
7224 assert(out - *buf <= *bufsize);
7225 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007226 /* (in - startin) <= size and size is an int */
7227 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007228
7229error:
7230 Py_XDECREF(encoding_obj);
7231 Py_XDECREF(errorHandler);
7232 Py_XDECREF(exc);
7233 return ret;
7234}
7235
Victor Stinner3a50e702011-10-18 21:21:00 +02007236static PyObject *
7237decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007238 const char *s, Py_ssize_t size,
7239 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007240{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007241 wchar_t *buf = NULL;
7242 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007243 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007244
Victor Stinner3a50e702011-10-18 21:21:00 +02007245 if (code_page < 0) {
7246 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7247 return NULL;
7248 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007249 if (size < 0) {
7250 PyErr_BadInternalCall();
7251 return NULL;
7252 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007253
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007254 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007255 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007256
Victor Stinner76a31a62011-11-04 00:05:13 +01007257 do
7258 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007259#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007260 if (size > INT_MAX) {
7261 chunk_size = INT_MAX;
7262 final = 0;
7263 done = 0;
7264 }
7265 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007266#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007267 {
7268 chunk_size = (int)size;
7269 final = (consumed == NULL);
7270 done = 1;
7271 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007272
Victor Stinner76a31a62011-11-04 00:05:13 +01007273 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007274 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007275 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007276 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007277 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007278
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007279 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007280 s, chunk_size);
7281 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007282 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007283 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007284 errors, final);
7285 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007286
7287 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007288 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007289 return NULL;
7290 }
7291
7292 if (consumed)
7293 *consumed += converted;
7294
7295 s += converted;
7296 size -= converted;
7297 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007298
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007299 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7300 PyMem_Free(buf);
7301 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007302}
7303
Alexander Belopolsky40018472011-02-26 01:02:56 +00007304PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007305PyUnicode_DecodeCodePageStateful(int code_page,
7306 const char *s,
7307 Py_ssize_t size,
7308 const char *errors,
7309 Py_ssize_t *consumed)
7310{
7311 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7312}
7313
7314PyObject *
7315PyUnicode_DecodeMBCSStateful(const char *s,
7316 Py_ssize_t size,
7317 const char *errors,
7318 Py_ssize_t *consumed)
7319{
7320 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7321}
7322
7323PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007324PyUnicode_DecodeMBCS(const char *s,
7325 Py_ssize_t size,
7326 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007327{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007328 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7329}
7330
Victor Stinner3a50e702011-10-18 21:21:00 +02007331static DWORD
7332encode_code_page_flags(UINT code_page, const char *errors)
7333{
7334 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007335 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007336 }
7337 else if (code_page == CP_UTF7) {
7338 /* CP_UTF7 only supports flags=0 */
7339 return 0;
7340 }
7341 else {
7342 if (errors != NULL && strcmp(errors, "replace") == 0)
7343 return 0;
7344 else
7345 return WC_NO_BEST_FIT_CHARS;
7346 }
7347}
7348
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007349/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007350 * Encode a Unicode string to a Windows code page into a byte string in strict
7351 * mode.
7352 *
7353 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007354 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007355 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007356static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007357encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007358 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007359 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007360{
Victor Stinner554f3f02010-06-16 23:33:54 +00007361 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007362 BOOL *pusedDefaultChar = &usedDefaultChar;
7363 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007364 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007365 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007366 const DWORD flags = encode_code_page_flags(code_page, NULL);
7367 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007368 /* Create a substring so that we can get the UTF-16 representation
7369 of just the slice under consideration. */
7370 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007371
Martin v. Löwis3d325192011-11-04 18:23:06 +01007372 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007373
Victor Stinner3a50e702011-10-18 21:21:00 +02007374 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007375 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007377 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007378
Victor Stinner2fc507f2011-11-04 20:06:39 +01007379 substring = PyUnicode_Substring(unicode, offset, offset+len);
7380 if (substring == NULL)
7381 return -1;
7382 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7383 if (p == NULL) {
7384 Py_DECREF(substring);
7385 return -1;
7386 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007387 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007388
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007389 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007390 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007391 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007392 NULL, 0,
7393 NULL, pusedDefaultChar);
7394 if (outsize <= 0)
7395 goto error;
7396 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007397 if (pusedDefaultChar && *pusedDefaultChar) {
7398 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007400 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007401
Victor Stinner3a50e702011-10-18 21:21:00 +02007402 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007404 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007405 if (*outbytes == NULL) {
7406 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007407 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007408 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007409 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007410 }
7411 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007412 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007413 const Py_ssize_t n = PyBytes_Size(*outbytes);
7414 if (outsize > PY_SSIZE_T_MAX - n) {
7415 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007416 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007418 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007419 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7420 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007421 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007422 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007423 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007424 }
7425
7426 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007428 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 out, outsize,
7430 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007431 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007432 if (outsize <= 0)
7433 goto error;
7434 if (pusedDefaultChar && *pusedDefaultChar)
7435 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007436 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007437
Victor Stinner3a50e702011-10-18 21:21:00 +02007438error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007439 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007440 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7441 return -2;
7442 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007443 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007444}
7445
Victor Stinner3a50e702011-10-18 21:21:00 +02007446/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007447 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007448 * error handler.
7449 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007450 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007451 * -1 on other error.
7452 */
7453static int
7454encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007455 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007456 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007457{
Victor Stinner3a50e702011-10-18 21:21:00 +02007458 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007459 Py_ssize_t pos = unicode_offset;
7460 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 /* Ideally, we should get reason from FormatMessage. This is the Windows
7462 2000 English version of the message. */
7463 const char *reason = "invalid character";
7464 /* 4=maximum length of a UTF-8 sequence */
7465 char buffer[4];
7466 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7467 Py_ssize_t outsize;
7468 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 PyObject *errorHandler = NULL;
7470 PyObject *exc = NULL;
7471 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007472 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007473 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007474 PyObject *rep;
7475 int ret = -1;
7476
7477 assert(insize > 0);
7478
7479 encoding = code_page_name(code_page, &encoding_obj);
7480 if (encoding == NULL)
7481 return -1;
7482
7483 if (errors == NULL || strcmp(errors, "strict") == 0) {
7484 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7485 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007486 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007487 if (exc != NULL) {
7488 PyCodec_StrictErrors(exc);
7489 Py_DECREF(exc);
7490 }
7491 Py_XDECREF(encoding_obj);
7492 return -1;
7493 }
7494
7495 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7496 pusedDefaultChar = &usedDefaultChar;
7497 else
7498 pusedDefaultChar = NULL;
7499
7500 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7501 PyErr_NoMemory();
7502 goto error;
7503 }
7504 outsize = insize * Py_ARRAY_LENGTH(buffer);
7505
7506 if (*outbytes == NULL) {
7507 /* Create string object */
7508 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7509 if (*outbytes == NULL)
7510 goto error;
7511 out = PyBytes_AS_STRING(*outbytes);
7512 }
7513 else {
7514 /* Extend string object */
7515 Py_ssize_t n = PyBytes_Size(*outbytes);
7516 if (n > PY_SSIZE_T_MAX - outsize) {
7517 PyErr_NoMemory();
7518 goto error;
7519 }
7520 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7521 goto error;
7522 out = PyBytes_AS_STRING(*outbytes) + n;
7523 }
7524
7525 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007526 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007528 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7529 wchar_t chars[2];
7530 int charsize;
7531 if (ch < 0x10000) {
7532 chars[0] = (wchar_t)ch;
7533 charsize = 1;
7534 }
7535 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007536 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7537 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007538 charsize = 2;
7539 }
7540
Victor Stinner3a50e702011-10-18 21:21:00 +02007541 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007542 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007543 buffer, Py_ARRAY_LENGTH(buffer),
7544 NULL, pusedDefaultChar);
7545 if (outsize > 0) {
7546 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7547 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007548 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007549 memcpy(out, buffer, outsize);
7550 out += outsize;
7551 continue;
7552 }
7553 }
7554 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7555 PyErr_SetFromWindowsErr(0);
7556 goto error;
7557 }
7558
Victor Stinner3a50e702011-10-18 21:21:00 +02007559 rep = unicode_encode_call_errorhandler(
7560 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007561 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007562 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007563 if (rep == NULL)
7564 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007565 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007566
7567 if (PyBytes_Check(rep)) {
7568 outsize = PyBytes_GET_SIZE(rep);
7569 if (outsize != 1) {
7570 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7571 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7572 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7573 Py_DECREF(rep);
7574 goto error;
7575 }
7576 out = PyBytes_AS_STRING(*outbytes) + offset;
7577 }
7578 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7579 out += outsize;
7580 }
7581 else {
7582 Py_ssize_t i;
7583 enum PyUnicode_Kind kind;
7584 void *data;
7585
Benjamin Petersonbac79492012-01-14 13:34:47 -05007586 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007587 Py_DECREF(rep);
7588 goto error;
7589 }
7590
7591 outsize = PyUnicode_GET_LENGTH(rep);
7592 if (outsize != 1) {
7593 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7594 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7595 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7596 Py_DECREF(rep);
7597 goto error;
7598 }
7599 out = PyBytes_AS_STRING(*outbytes) + offset;
7600 }
7601 kind = PyUnicode_KIND(rep);
7602 data = PyUnicode_DATA(rep);
7603 for (i=0; i < outsize; i++) {
7604 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7605 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007606 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007607 encoding, unicode,
7608 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007609 "unable to encode error handler result to ASCII");
7610 Py_DECREF(rep);
7611 goto error;
7612 }
7613 *out = (unsigned char)ch;
7614 out++;
7615 }
7616 }
7617 Py_DECREF(rep);
7618 }
7619 /* write a NUL byte */
7620 *out = 0;
7621 outsize = out - PyBytes_AS_STRING(*outbytes);
7622 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7623 if (_PyBytes_Resize(outbytes, outsize) < 0)
7624 goto error;
7625 ret = 0;
7626
7627error:
7628 Py_XDECREF(encoding_obj);
7629 Py_XDECREF(errorHandler);
7630 Py_XDECREF(exc);
7631 return ret;
7632}
7633
Victor Stinner3a50e702011-10-18 21:21:00 +02007634static PyObject *
7635encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007636 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007637 const char *errors)
7638{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007639 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007640 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007641 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007642 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007643
Victor Stinner29dacf22015-01-26 16:41:32 +01007644 if (!PyUnicode_Check(unicode)) {
7645 PyErr_BadArgument();
7646 return NULL;
7647 }
7648
Benjamin Petersonbac79492012-01-14 13:34:47 -05007649 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007650 return NULL;
7651 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007652
Victor Stinner3a50e702011-10-18 21:21:00 +02007653 if (code_page < 0) {
7654 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7655 return NULL;
7656 }
7657
Martin v. Löwis3d325192011-11-04 18:23:06 +01007658 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007659 return PyBytes_FromStringAndSize(NULL, 0);
7660
Victor Stinner7581cef2011-11-03 22:32:33 +01007661 offset = 0;
7662 do
7663 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007664#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007665 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007666 chunks. */
7667 if (len > INT_MAX/2) {
7668 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007669 done = 0;
7670 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007671 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007672#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007673 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007674 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007675 done = 1;
7676 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007677
Victor Stinner76a31a62011-11-04 00:05:13 +01007678 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007679 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007680 errors);
7681 if (ret == -2)
7682 ret = encode_code_page_errors(code_page, &outbytes,
7683 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007684 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007685 if (ret < 0) {
7686 Py_XDECREF(outbytes);
7687 return NULL;
7688 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007689
Victor Stinner7581cef2011-11-03 22:32:33 +01007690 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007691 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007692 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007693
Victor Stinner3a50e702011-10-18 21:21:00 +02007694 return outbytes;
7695}
7696
7697PyObject *
7698PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7699 Py_ssize_t size,
7700 const char *errors)
7701{
Victor Stinner7581cef2011-11-03 22:32:33 +01007702 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007703 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007704 if (unicode == NULL)
7705 return NULL;
7706 res = encode_code_page(CP_ACP, unicode, errors);
7707 Py_DECREF(unicode);
7708 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007709}
7710
7711PyObject *
7712PyUnicode_EncodeCodePage(int code_page,
7713 PyObject *unicode,
7714 const char *errors)
7715{
Victor Stinner7581cef2011-11-03 22:32:33 +01007716 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007717}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007718
Alexander Belopolsky40018472011-02-26 01:02:56 +00007719PyObject *
7720PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007721{
Victor Stinner7581cef2011-11-03 22:32:33 +01007722 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007723}
7724
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007725#undef NEED_RETRY
7726
Steve Dowercc16be82016-09-08 10:35:16 -07007727#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007728
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729/* --- Character Mapping Codec -------------------------------------------- */
7730
Victor Stinnerfb161b12013-04-18 01:44:27 +02007731static int
7732charmap_decode_string(const char *s,
7733 Py_ssize_t size,
7734 PyObject *mapping,
7735 const char *errors,
7736 _PyUnicodeWriter *writer)
7737{
7738 const char *starts = s;
7739 const char *e;
7740 Py_ssize_t startinpos, endinpos;
7741 PyObject *errorHandler = NULL, *exc = NULL;
7742 Py_ssize_t maplen;
7743 enum PyUnicode_Kind mapkind;
7744 void *mapdata;
7745 Py_UCS4 x;
7746 unsigned char ch;
7747
7748 if (PyUnicode_READY(mapping) == -1)
7749 return -1;
7750
7751 maplen = PyUnicode_GET_LENGTH(mapping);
7752 mapdata = PyUnicode_DATA(mapping);
7753 mapkind = PyUnicode_KIND(mapping);
7754
7755 e = s + size;
7756
7757 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7758 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7759 * is disabled in encoding aliases, latin1 is preferred because
7760 * its implementation is faster. */
7761 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7762 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7763 Py_UCS4 maxchar = writer->maxchar;
7764
7765 assert (writer->kind == PyUnicode_1BYTE_KIND);
7766 while (s < e) {
7767 ch = *s;
7768 x = mapdata_ucs1[ch];
7769 if (x > maxchar) {
7770 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7771 goto onError;
7772 maxchar = writer->maxchar;
7773 outdata = (Py_UCS1 *)writer->data;
7774 }
7775 outdata[writer->pos] = x;
7776 writer->pos++;
7777 ++s;
7778 }
7779 return 0;
7780 }
7781
7782 while (s < e) {
7783 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7784 enum PyUnicode_Kind outkind = writer->kind;
7785 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7786 if (outkind == PyUnicode_1BYTE_KIND) {
7787 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7788 Py_UCS4 maxchar = writer->maxchar;
7789 while (s < e) {
7790 ch = *s;
7791 x = mapdata_ucs2[ch];
7792 if (x > maxchar)
7793 goto Error;
7794 outdata[writer->pos] = x;
7795 writer->pos++;
7796 ++s;
7797 }
7798 break;
7799 }
7800 else if (outkind == PyUnicode_2BYTE_KIND) {
7801 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7802 while (s < e) {
7803 ch = *s;
7804 x = mapdata_ucs2[ch];
7805 if (x == 0xFFFE)
7806 goto Error;
7807 outdata[writer->pos] = x;
7808 writer->pos++;
7809 ++s;
7810 }
7811 break;
7812 }
7813 }
7814 ch = *s;
7815
7816 if (ch < maplen)
7817 x = PyUnicode_READ(mapkind, mapdata, ch);
7818 else
7819 x = 0xfffe; /* invalid value */
7820Error:
7821 if (x == 0xfffe)
7822 {
7823 /* undefined mapping */
7824 startinpos = s-starts;
7825 endinpos = startinpos+1;
7826 if (unicode_decode_call_errorhandler_writer(
7827 errors, &errorHandler,
7828 "charmap", "character maps to <undefined>",
7829 &starts, &e, &startinpos, &endinpos, &exc, &s,
7830 writer)) {
7831 goto onError;
7832 }
7833 continue;
7834 }
7835
7836 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7837 goto onError;
7838 ++s;
7839 }
7840 Py_XDECREF(errorHandler);
7841 Py_XDECREF(exc);
7842 return 0;
7843
7844onError:
7845 Py_XDECREF(errorHandler);
7846 Py_XDECREF(exc);
7847 return -1;
7848}
7849
7850static int
7851charmap_decode_mapping(const char *s,
7852 Py_ssize_t size,
7853 PyObject *mapping,
7854 const char *errors,
7855 _PyUnicodeWriter *writer)
7856{
7857 const char *starts = s;
7858 const char *e;
7859 Py_ssize_t startinpos, endinpos;
7860 PyObject *errorHandler = NULL, *exc = NULL;
7861 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007862 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007863
7864 e = s + size;
7865
7866 while (s < e) {
7867 ch = *s;
7868
7869 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7870 key = PyLong_FromLong((long)ch);
7871 if (key == NULL)
7872 goto onError;
7873
7874 item = PyObject_GetItem(mapping, key);
7875 Py_DECREF(key);
7876 if (item == NULL) {
7877 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7878 /* No mapping found means: mapping is undefined. */
7879 PyErr_Clear();
7880 goto Undefined;
7881 } else
7882 goto onError;
7883 }
7884
7885 /* Apply mapping */
7886 if (item == Py_None)
7887 goto Undefined;
7888 if (PyLong_Check(item)) {
7889 long value = PyLong_AS_LONG(item);
7890 if (value == 0xFFFE)
7891 goto Undefined;
7892 if (value < 0 || value > MAX_UNICODE) {
7893 PyErr_Format(PyExc_TypeError,
7894 "character mapping must be in range(0x%lx)",
7895 (unsigned long)MAX_UNICODE + 1);
7896 goto onError;
7897 }
7898
7899 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7900 goto onError;
7901 }
7902 else if (PyUnicode_Check(item)) {
7903 if (PyUnicode_READY(item) == -1)
7904 goto onError;
7905 if (PyUnicode_GET_LENGTH(item) == 1) {
7906 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7907 if (value == 0xFFFE)
7908 goto Undefined;
7909 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7910 goto onError;
7911 }
7912 else {
7913 writer->overallocate = 1;
7914 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7915 goto onError;
7916 }
7917 }
7918 else {
7919 /* wrong return value */
7920 PyErr_SetString(PyExc_TypeError,
7921 "character mapping must return integer, None or str");
7922 goto onError;
7923 }
7924 Py_CLEAR(item);
7925 ++s;
7926 continue;
7927
7928Undefined:
7929 /* undefined mapping */
7930 Py_CLEAR(item);
7931 startinpos = s-starts;
7932 endinpos = startinpos+1;
7933 if (unicode_decode_call_errorhandler_writer(
7934 errors, &errorHandler,
7935 "charmap", "character maps to <undefined>",
7936 &starts, &e, &startinpos, &endinpos, &exc, &s,
7937 writer)) {
7938 goto onError;
7939 }
7940 }
7941 Py_XDECREF(errorHandler);
7942 Py_XDECREF(exc);
7943 return 0;
7944
7945onError:
7946 Py_XDECREF(item);
7947 Py_XDECREF(errorHandler);
7948 Py_XDECREF(exc);
7949 return -1;
7950}
7951
Alexander Belopolsky40018472011-02-26 01:02:56 +00007952PyObject *
7953PyUnicode_DecodeCharmap(const char *s,
7954 Py_ssize_t size,
7955 PyObject *mapping,
7956 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007958 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007959
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960 /* Default to Latin-1 */
7961 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007965 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007966 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007967 writer.min_length = size;
7968 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007970
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007971 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007972 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7973 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007974 }
7975 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007976 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7977 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007979 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007980
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007982 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983 return NULL;
7984}
7985
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007986/* Charmap encoding: the lookup table */
7987
Alexander Belopolsky40018472011-02-26 01:02:56 +00007988struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 PyObject_HEAD
7990 unsigned char level1[32];
7991 int count2, count3;
7992 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007993};
7994
7995static PyObject*
7996encoding_map_size(PyObject *obj, PyObject* args)
7997{
7998 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007999 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008001}
8002
8003static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008004 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 PyDoc_STR("Return the size (in bytes) of this object") },
8006 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008007};
8008
8009static void
8010encoding_map_dealloc(PyObject* o)
8011{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008012 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008013}
8014
8015static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008016 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 "EncodingMap", /*tp_name*/
8018 sizeof(struct encoding_map), /*tp_basicsize*/
8019 0, /*tp_itemsize*/
8020 /* methods */
8021 encoding_map_dealloc, /*tp_dealloc*/
8022 0, /*tp_print*/
8023 0, /*tp_getattr*/
8024 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008025 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 0, /*tp_repr*/
8027 0, /*tp_as_number*/
8028 0, /*tp_as_sequence*/
8029 0, /*tp_as_mapping*/
8030 0, /*tp_hash*/
8031 0, /*tp_call*/
8032 0, /*tp_str*/
8033 0, /*tp_getattro*/
8034 0, /*tp_setattro*/
8035 0, /*tp_as_buffer*/
8036 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8037 0, /*tp_doc*/
8038 0, /*tp_traverse*/
8039 0, /*tp_clear*/
8040 0, /*tp_richcompare*/
8041 0, /*tp_weaklistoffset*/
8042 0, /*tp_iter*/
8043 0, /*tp_iternext*/
8044 encoding_map_methods, /*tp_methods*/
8045 0, /*tp_members*/
8046 0, /*tp_getset*/
8047 0, /*tp_base*/
8048 0, /*tp_dict*/
8049 0, /*tp_descr_get*/
8050 0, /*tp_descr_set*/
8051 0, /*tp_dictoffset*/
8052 0, /*tp_init*/
8053 0, /*tp_alloc*/
8054 0, /*tp_new*/
8055 0, /*tp_free*/
8056 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008057};
8058
8059PyObject*
8060PyUnicode_BuildEncodingMap(PyObject* string)
8061{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008062 PyObject *result;
8063 struct encoding_map *mresult;
8064 int i;
8065 int need_dict = 0;
8066 unsigned char level1[32];
8067 unsigned char level2[512];
8068 unsigned char *mlevel1, *mlevel2, *mlevel3;
8069 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008070 int kind;
8071 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008072 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008073 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008074
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008075 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008076 PyErr_BadArgument();
8077 return NULL;
8078 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008079 kind = PyUnicode_KIND(string);
8080 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008081 length = PyUnicode_GET_LENGTH(string);
8082 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008083 memset(level1, 0xFF, sizeof level1);
8084 memset(level2, 0xFF, sizeof level2);
8085
8086 /* If there isn't a one-to-one mapping of NULL to \0,
8087 or if there are non-BMP characters, we need to use
8088 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008089 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008090 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008091 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008092 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008093 ch = PyUnicode_READ(kind, data, i);
8094 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008095 need_dict = 1;
8096 break;
8097 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008098 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008099 /* unmapped character */
8100 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008101 l1 = ch >> 11;
8102 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008103 if (level1[l1] == 0xFF)
8104 level1[l1] = count2++;
8105 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008106 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008107 }
8108
8109 if (count2 >= 0xFF || count3 >= 0xFF)
8110 need_dict = 1;
8111
8112 if (need_dict) {
8113 PyObject *result = PyDict_New();
8114 PyObject *key, *value;
8115 if (!result)
8116 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008117 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008118 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008119 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008120 if (!key || !value)
8121 goto failed1;
8122 if (PyDict_SetItem(result, key, value) == -1)
8123 goto failed1;
8124 Py_DECREF(key);
8125 Py_DECREF(value);
8126 }
8127 return result;
8128 failed1:
8129 Py_XDECREF(key);
8130 Py_XDECREF(value);
8131 Py_DECREF(result);
8132 return NULL;
8133 }
8134
8135 /* Create a three-level trie */
8136 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8137 16*count2 + 128*count3 - 1);
8138 if (!result)
8139 return PyErr_NoMemory();
8140 PyObject_Init(result, &EncodingMapType);
8141 mresult = (struct encoding_map*)result;
8142 mresult->count2 = count2;
8143 mresult->count3 = count3;
8144 mlevel1 = mresult->level1;
8145 mlevel2 = mresult->level23;
8146 mlevel3 = mresult->level23 + 16*count2;
8147 memcpy(mlevel1, level1, 32);
8148 memset(mlevel2, 0xFF, 16*count2);
8149 memset(mlevel3, 0, 128*count3);
8150 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008151 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008152 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008153 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8154 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008155 /* unmapped character */
8156 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008157 o1 = ch>>11;
8158 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008159 i2 = 16*mlevel1[o1] + o2;
8160 if (mlevel2[i2] == 0xFF)
8161 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008162 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008163 i3 = 128*mlevel2[i2] + o3;
8164 mlevel3[i3] = i;
8165 }
8166 return result;
8167}
8168
8169static int
Victor Stinner22168992011-11-20 17:09:18 +01008170encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008171{
8172 struct encoding_map *map = (struct encoding_map*)mapping;
8173 int l1 = c>>11;
8174 int l2 = (c>>7) & 0xF;
8175 int l3 = c & 0x7F;
8176 int i;
8177
Victor Stinner22168992011-11-20 17:09:18 +01008178 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008180 if (c == 0)
8181 return 0;
8182 /* level 1*/
8183 i = map->level1[l1];
8184 if (i == 0xFF) {
8185 return -1;
8186 }
8187 /* level 2*/
8188 i = map->level23[16*i+l2];
8189 if (i == 0xFF) {
8190 return -1;
8191 }
8192 /* level 3 */
8193 i = map->level23[16*map->count2 + 128*i + l3];
8194 if (i == 0) {
8195 return -1;
8196 }
8197 return i;
8198}
8199
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008200/* Lookup the character ch in the mapping. If the character
8201 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008202 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008203static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008204charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008205{
Christian Heimes217cfd12007-12-02 14:31:20 +00008206 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008207 PyObject *x;
8208
8209 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008211 x = PyObject_GetItem(mapping, w);
8212 Py_DECREF(w);
8213 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8215 /* No mapping found means: mapping is undefined. */
8216 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008217 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008218 } else
8219 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008221 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008222 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008223 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 long value = PyLong_AS_LONG(x);
8225 if (value < 0 || value > 255) {
8226 PyErr_SetString(PyExc_TypeError,
8227 "character mapping must be in range(256)");
8228 Py_DECREF(x);
8229 return NULL;
8230 }
8231 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008233 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008234 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 /* wrong return value */
8237 PyErr_Format(PyExc_TypeError,
8238 "character mapping must return integer, bytes or None, not %.400s",
8239 x->ob_type->tp_name);
8240 Py_DECREF(x);
8241 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242 }
8243}
8244
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008245static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008246charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008247{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008248 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8249 /* exponentially overallocate to minimize reallocations */
8250 if (requiredsize < 2*outsize)
8251 requiredsize = 2*outsize;
8252 if (_PyBytes_Resize(outobj, requiredsize))
8253 return -1;
8254 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008255}
8256
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the character has no mapping; nothing written */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008261 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 space is available. Return a new reference to the object that
8263 was put in the output buffer, or Py_None, if the mapping was undefined
8264 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008265 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008266static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008267charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008268 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008270 PyObject *rep;
8271 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008272 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273
Christian Heimes90aa7642007-12-19 02:45:37 +00008274 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008275 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008277 if (res == -1)
8278 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 if (outsize<requiredsize)
8280 if (charmapencode_resize(outobj, outpos, requiredsize))
8281 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008282 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 outstart[(*outpos)++] = (char)res;
8284 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008285 }
8286
8287 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008288 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008290 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 Py_DECREF(rep);
8292 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008293 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 if (PyLong_Check(rep)) {
8295 Py_ssize_t requiredsize = *outpos+1;
8296 if (outsize<requiredsize)
8297 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8298 Py_DECREF(rep);
8299 return enc_EXCEPTION;
8300 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008301 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008302 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008303 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 else {
8305 const char *repchars = PyBytes_AS_STRING(rep);
8306 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8307 Py_ssize_t requiredsize = *outpos+repsize;
8308 if (outsize<requiredsize)
8309 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8310 Py_DECREF(rep);
8311 return enc_EXCEPTION;
8312 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008313 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 memcpy(outstart + *outpos, repchars, repsize);
8315 *outpos += repsize;
8316 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008317 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008318 Py_DECREF(rep);
8319 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008320}
8321
8322/* handle an error in PyUnicode_EncodeCharmap
8323 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008324static int
8325charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008326 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008327 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008328 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008329 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330{
8331 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008332 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008333 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008334 enum PyUnicode_Kind kind;
8335 void *data;
8336 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008337 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008338 Py_ssize_t collstartpos = *inpos;
8339 Py_ssize_t collendpos = *inpos+1;
8340 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008341 const char *encoding = "charmap";
8342 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008343 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008344 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008345 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008346
Benjamin Petersonbac79492012-01-14 13:34:47 -05008347 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008348 return -1;
8349 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350 /* find all unencodable characters */
8351 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008352 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008353 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008354 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008355 val = encoding_map_lookup(ch, mapping);
8356 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 break;
8358 ++collendpos;
8359 continue;
8360 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008361
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008362 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8363 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 if (rep==NULL)
8365 return -1;
8366 else if (rep!=Py_None) {
8367 Py_DECREF(rep);
8368 break;
8369 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008370 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372 }
8373 /* cache callback name lookup
8374 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008375 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008376 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008377
8378 switch (*error_handler) {
8379 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008380 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008381 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008382
8383 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008384 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 x = charmapencode_output('?', mapping, res, respos);
8386 if (x==enc_EXCEPTION) {
8387 return -1;
8388 }
8389 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008390 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 return -1;
8392 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008393 }
8394 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008395 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008396 *inpos = collendpos;
8397 break;
Victor Stinner50149202015-09-22 00:26:54 +02008398
8399 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008400 /* generate replacement (temporarily (mis)uses p) */
8401 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 char buffer[2+29+1+1];
8403 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008404 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 for (cp = buffer; *cp; ++cp) {
8406 x = charmapencode_output(*cp, mapping, res, respos);
8407 if (x==enc_EXCEPTION)
8408 return -1;
8409 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008410 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 return -1;
8412 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008413 }
8414 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008415 *inpos = collendpos;
8416 break;
Victor Stinner50149202015-09-22 00:26:54 +02008417
Benjamin Peterson14339b62009-01-31 16:36:08 +00008418 default:
Victor Stinner50149202015-09-22 00:26:54 +02008419 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008420 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008422 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008424 if (PyBytes_Check(repunicode)) {
8425 /* Directly copy bytes result to output. */
8426 Py_ssize_t outsize = PyBytes_Size(*res);
8427 Py_ssize_t requiredsize;
8428 repsize = PyBytes_Size(repunicode);
8429 requiredsize = *respos + repsize;
8430 if (requiredsize > outsize)
8431 /* Make room for all additional bytes. */
8432 if (charmapencode_resize(res, respos, requiredsize)) {
8433 Py_DECREF(repunicode);
8434 return -1;
8435 }
8436 memcpy(PyBytes_AsString(*res) + *respos,
8437 PyBytes_AsString(repunicode), repsize);
8438 *respos += repsize;
8439 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008440 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008441 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008442 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008443 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008444 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008445 Py_DECREF(repunicode);
8446 return -1;
8447 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008448 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008449 data = PyUnicode_DATA(repunicode);
8450 kind = PyUnicode_KIND(repunicode);
8451 for (index = 0; index < repsize; index++) {
8452 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8453 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008455 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 return -1;
8457 }
8458 else if (x==enc_FAILED) {
8459 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008460 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 return -1;
8462 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008463 }
8464 *inpos = newpos;
8465 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008466 }
8467 return 0;
8468}
8469
Alexander Belopolsky40018472011-02-26 01:02:56 +00008470PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008471_PyUnicode_EncodeCharmap(PyObject *unicode,
8472 PyObject *mapping,
8473 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008474{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008475 /* output object */
8476 PyObject *res = NULL;
8477 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008478 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008479 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008480 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008481 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008482 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008483 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008484 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008485 void *data;
8486 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487
Benjamin Petersonbac79492012-01-14 13:34:47 -05008488 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008489 return NULL;
8490 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008491 data = PyUnicode_DATA(unicode);
8492 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008493
Guido van Rossumd57fd912000-03-10 22:53:23 +00008494 /* Default to Latin-1 */
8495 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008496 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008497
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008498 /* allocate enough for a simple encoding without
8499 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008500 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008501 if (res == NULL)
8502 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008503 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008506 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008507 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008509 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 if (x==enc_EXCEPTION) /* error */
8511 goto onError;
8512 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008513 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008515 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 &res, &respos)) {
8517 goto onError;
8518 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008519 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 else
8521 /* done with this character => adjust input position */
8522 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008526 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008527 if (_PyBytes_Resize(&res, respos) < 0)
8528 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008529
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008531 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008532 return res;
8533
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008535 Py_XDECREF(res);
8536 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008537 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538 return NULL;
8539}
8540
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008541/* Deprecated */
8542PyObject *
8543PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8544 Py_ssize_t size,
8545 PyObject *mapping,
8546 const char *errors)
8547{
8548 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008549 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008550 if (unicode == NULL)
8551 return NULL;
8552 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8553 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008554 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008555}
8556
Alexander Belopolsky40018472011-02-26 01:02:56 +00008557PyObject *
8558PyUnicode_AsCharmapString(PyObject *unicode,
8559 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560{
8561 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 PyErr_BadArgument();
8563 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008565 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566}
8567
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008568/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008569static void
8570make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008571 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008572 Py_ssize_t startpos, Py_ssize_t endpos,
8573 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008575 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008576 *exceptionObject = _PyUnicodeTranslateError_Create(
8577 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 }
8579 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8581 goto onError;
8582 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8583 goto onError;
8584 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8585 goto onError;
8586 return;
8587 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008588 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589 }
8590}
8591
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592/* error handling callback helper:
8593 build arguments, call the callback and check the arguments,
8594 put the result into newpos and return the replacement string, which
8595 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008596static PyObject *
8597unicode_translate_call_errorhandler(const char *errors,
8598 PyObject **errorHandler,
8599 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008601 Py_ssize_t startpos, Py_ssize_t endpos,
8602 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008603{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008604 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008605
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008606 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008607 PyObject *restuple;
8608 PyObject *resunicode;
8609
8610 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008612 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008614 }
8615
8616 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008617 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008618 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008620
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008621 restuple = PyObject_CallFunctionObjArgs(
8622 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008626 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 Py_DECREF(restuple);
8628 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008629 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008630 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 &resunicode, &i_newpos)) {
8632 Py_DECREF(restuple);
8633 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008634 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008635 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008637 else
8638 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008640 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 Py_DECREF(restuple);
8642 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008643 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644 Py_INCREF(resunicode);
8645 Py_DECREF(restuple);
8646 return resunicode;
8647}
8648
8649/* Lookup the character ch in the mapping and put the result in result,
8650 which must be decrefed by the caller.
8651 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008652static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008654{
Christian Heimes217cfd12007-12-02 14:31:20 +00008655 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008656 PyObject *x;
8657
8658 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008660 x = PyObject_GetItem(mapping, w);
8661 Py_DECREF(w);
8662 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8664 /* No mapping found means: use 1:1 mapping. */
8665 PyErr_Clear();
8666 *result = NULL;
8667 return 0;
8668 } else
8669 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670 }
8671 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 *result = x;
8673 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008675 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008677 if (value < 0 || value > MAX_UNICODE) {
8678 PyErr_Format(PyExc_ValueError,
8679 "character mapping must be in range(0x%x)",
8680 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 Py_DECREF(x);
8682 return -1;
8683 }
8684 *result = x;
8685 return 0;
8686 }
8687 else if (PyUnicode_Check(x)) {
8688 *result = x;
8689 return 0;
8690 }
8691 else {
8692 /* wrong return value */
8693 PyErr_SetString(PyExc_TypeError,
8694 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008695 Py_DECREF(x);
8696 return -1;
8697 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008698}
Victor Stinner1194ea02014-04-04 19:37:40 +02008699
8700/* lookup the character, write the result into the writer.
8701 Return 1 if the result was written into the writer, return 0 if the mapping
8702 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008703static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008704charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8705 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008706{
Victor Stinner1194ea02014-04-04 19:37:40 +02008707 PyObject *item;
8708
8709 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008710 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008711
8712 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008714 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008717 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008719
8720 if (item == Py_None) {
8721 Py_DECREF(item);
8722 return 0;
8723 }
8724
8725 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008726 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8727 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8728 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008729 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8730 Py_DECREF(item);
8731 return -1;
8732 }
8733 Py_DECREF(item);
8734 return 1;
8735 }
8736
8737 if (!PyUnicode_Check(item)) {
8738 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008740 }
8741
8742 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8743 Py_DECREF(item);
8744 return -1;
8745 }
8746
8747 Py_DECREF(item);
8748 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008749}
8750
Victor Stinner89a76ab2014-04-05 11:44:04 +02008751static int
8752unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8753 Py_UCS1 *translate)
8754{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008755 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008756 int ret = 0;
8757
Victor Stinner89a76ab2014-04-05 11:44:04 +02008758 if (charmaptranslate_lookup(ch, mapping, &item)) {
8759 return -1;
8760 }
8761
8762 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008763 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008764 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008765 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008766 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008767 /* not found => default to 1:1 mapping */
8768 translate[ch] = ch;
8769 return 1;
8770 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008771 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008772 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008773 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8774 used it */
8775 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008776 /* invalid character or character outside ASCII:
8777 skip the fast translate */
8778 goto exit;
8779 }
8780 translate[ch] = (Py_UCS1)replace;
8781 }
8782 else if (PyUnicode_Check(item)) {
8783 Py_UCS4 replace;
8784
8785 if (PyUnicode_READY(item) == -1) {
8786 Py_DECREF(item);
8787 return -1;
8788 }
8789 if (PyUnicode_GET_LENGTH(item) != 1)
8790 goto exit;
8791
8792 replace = PyUnicode_READ_CHAR(item, 0);
8793 if (replace > 127)
8794 goto exit;
8795 translate[ch] = (Py_UCS1)replace;
8796 }
8797 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008798 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008799 goto exit;
8800 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008801 ret = 1;
8802
Benjamin Peterson1365de72014-04-07 20:15:41 -04008803 exit:
8804 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008805 return ret;
8806}
8807
8808/* Fast path for ascii => ascii translation. Return 1 if the whole string
8809 was translated into writer, return 0 if the input string was partially
8810 translated into writer, raise an exception and return -1 on error. */
8811static int
8812unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008813 _PyUnicodeWriter *writer, int ignore,
8814 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008815{
Victor Stinner872b2912014-04-05 14:27:07 +02008816 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008817 Py_ssize_t len;
8818 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008819 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008820
Victor Stinner89a76ab2014-04-05 11:44:04 +02008821 len = PyUnicode_GET_LENGTH(input);
8822
Victor Stinner872b2912014-04-05 14:27:07 +02008823 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008824
8825 in = PyUnicode_1BYTE_DATA(input);
8826 end = in + len;
8827
8828 assert(PyUnicode_IS_ASCII(writer->buffer));
8829 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8830 out = PyUnicode_1BYTE_DATA(writer->buffer);
8831
Victor Stinner872b2912014-04-05 14:27:07 +02008832 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008833 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008834 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008835 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008836 int translate = unicode_fast_translate_lookup(mapping, ch,
8837 ascii_table);
8838 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008839 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008840 if (translate == 0)
8841 goto exit;
8842 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008843 }
Victor Stinner872b2912014-04-05 14:27:07 +02008844 if (ch2 == 0xfe) {
8845 if (ignore)
8846 continue;
8847 goto exit;
8848 }
8849 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008850 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008851 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008852 }
Victor Stinner872b2912014-04-05 14:27:07 +02008853 res = 1;
8854
8855exit:
8856 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008857 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008858 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008859}
8860
Victor Stinner3222da22015-10-01 22:07:32 +02008861static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008862_PyUnicode_TranslateCharmap(PyObject *input,
8863 PyObject *mapping,
8864 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008867 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008868 Py_ssize_t size, i;
8869 int kind;
8870 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008871 _PyUnicodeWriter writer;
8872 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008873 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008874 PyObject *errorHandler = NULL;
8875 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008876 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008877 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008878
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 PyErr_BadArgument();
8881 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008884 if (PyUnicode_READY(input) == -1)
8885 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008886 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008887 kind = PyUnicode_KIND(input);
8888 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008889
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008890 if (size == 0)
8891 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008892
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008893 /* allocate enough for a simple 1:1 translation without
8894 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008895 _PyUnicodeWriter_Init(&writer);
8896 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008898
Victor Stinner872b2912014-04-05 14:27:07 +02008899 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8900
Victor Stinner33798672016-03-01 21:59:58 +01008901 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008902 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008903 if (PyUnicode_IS_ASCII(input)) {
8904 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8905 if (res < 0) {
8906 _PyUnicodeWriter_Dealloc(&writer);
8907 return NULL;
8908 }
8909 if (res == 1)
8910 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008911 }
Victor Stinner33798672016-03-01 21:59:58 +01008912 else {
8913 i = 0;
8914 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008917 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008918 int translate;
8919 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8920 Py_ssize_t newpos;
8921 /* startpos for collecting untranslatable chars */
8922 Py_ssize_t collstart;
8923 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008924 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925
Victor Stinner1194ea02014-04-04 19:37:40 +02008926 ch = PyUnicode_READ(kind, data, i);
8927 translate = charmaptranslate_output(ch, mapping, &writer);
8928 if (translate < 0)
8929 goto onError;
8930
8931 if (translate != 0) {
8932 /* it worked => adjust input pointer */
8933 ++i;
8934 continue;
8935 }
8936
8937 /* untranslatable character */
8938 collstart = i;
8939 collend = i+1;
8940
8941 /* find all untranslatable characters */
8942 while (collend < size) {
8943 PyObject *x;
8944 ch = PyUnicode_READ(kind, data, collend);
8945 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008946 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008947 Py_XDECREF(x);
8948 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008950 ++collend;
8951 }
8952
8953 if (ignore) {
8954 i = collend;
8955 }
8956 else {
8957 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8958 reason, input, &exc,
8959 collstart, collend, &newpos);
8960 if (repunicode == NULL)
8961 goto onError;
8962 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008963 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008964 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008965 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008966 Py_DECREF(repunicode);
8967 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008968 }
8969 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008970 Py_XDECREF(exc);
8971 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008972 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973
Benjamin Peterson29060642009-01-31 22:14:21 +00008974 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008975 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008976 Py_XDECREF(exc);
8977 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978 return NULL;
8979}
8980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981/* Deprecated. Use PyUnicode_Translate instead. */
8982PyObject *
8983PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8984 Py_ssize_t size,
8985 PyObject *mapping,
8986 const char *errors)
8987{
Christian Heimes5f520f42012-09-11 14:03:25 +02008988 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008989 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990 if (!unicode)
8991 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008992 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8993 Py_DECREF(unicode);
8994 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008995}
8996
Alexander Belopolsky40018472011-02-26 01:02:56 +00008997PyObject *
8998PyUnicode_Translate(PyObject *str,
8999 PyObject *mapping,
9000 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009002 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009003 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009004 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005}
Tim Petersced69f82003-09-16 20:30:58 +00009006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009007PyObject *
9008_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9009{
9010 if (!PyUnicode_Check(unicode)) {
9011 PyErr_BadInternalCall();
9012 return NULL;
9013 }
9014 if (PyUnicode_READY(unicode) == -1)
9015 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009016 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 /* If the string is already ASCII, just return the same string */
9018 Py_INCREF(unicode);
9019 return unicode;
9020 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009021
9022 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9023 PyObject *result = PyUnicode_New(len, 127);
9024 if (result == NULL) {
9025 return NULL;
9026 }
9027
9028 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9029 int kind = PyUnicode_KIND(unicode);
9030 const void *data = PyUnicode_DATA(unicode);
9031 Py_ssize_t i;
9032 for (i = 0; i < len; ++i) {
9033 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9034 if (ch < 127) {
9035 out[i] = ch;
9036 }
9037 else if (Py_UNICODE_ISSPACE(ch)) {
9038 out[i] = ' ';
9039 }
9040 else {
9041 int decimal = Py_UNICODE_TODECIMAL(ch);
9042 if (decimal < 0) {
9043 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009044 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009045 _PyUnicode_LENGTH(result) = i + 1;
9046 break;
9047 }
9048 out[i] = '0' + decimal;
9049 }
9050 }
9051
INADA Naoki16dfca42018-07-14 12:06:43 +09009052 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009053 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054}
9055
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009056PyObject *
9057PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9058 Py_ssize_t length)
9059{
Victor Stinnerf0124502011-11-21 23:12:56 +01009060 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009061 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009062 Py_UCS4 maxchar;
9063 enum PyUnicode_Kind kind;
9064 void *data;
9065
Victor Stinner99d7ad02012-02-22 13:37:39 +01009066 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009067 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009068 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009069 if (ch > 127) {
9070 int decimal = Py_UNICODE_TODECIMAL(ch);
9071 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009072 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009073 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009074 }
9075 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009076
9077 /* Copy to a new string */
9078 decimal = PyUnicode_New(length, maxchar);
9079 if (decimal == NULL)
9080 return decimal;
9081 kind = PyUnicode_KIND(decimal);
9082 data = PyUnicode_DATA(decimal);
9083 /* Iterate over code points */
9084 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009085 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009086 if (ch > 127) {
9087 int decimal = Py_UNICODE_TODECIMAL(ch);
9088 if (decimal >= 0)
9089 ch = '0' + decimal;
9090 }
9091 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009093 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009094}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009095/* --- Decimal Encoder ---------------------------------------------------- */
9096
Alexander Belopolsky40018472011-02-26 01:02:56 +00009097int
9098PyUnicode_EncodeDecimal(Py_UNICODE *s,
9099 Py_ssize_t length,
9100 char *output,
9101 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009102{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009103 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009104 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009105 enum PyUnicode_Kind kind;
9106 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009107
9108 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009109 PyErr_BadArgument();
9110 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009111 }
9112
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009113 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009114 if (unicode == NULL)
9115 return -1;
9116
Victor Stinner42bf7752011-11-21 22:52:58 +01009117 kind = PyUnicode_KIND(unicode);
9118 data = PyUnicode_DATA(unicode);
9119
Victor Stinnerb84d7232011-11-22 01:50:07 +01009120 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009121 PyObject *exc;
9122 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009123 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009124 Py_ssize_t startpos;
9125
9126 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009127
Benjamin Peterson29060642009-01-31 22:14:21 +00009128 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009129 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009130 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009132 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009133 decimal = Py_UNICODE_TODECIMAL(ch);
9134 if (decimal >= 0) {
9135 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009136 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 continue;
9138 }
9139 if (0 < ch && ch < 256) {
9140 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009141 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009142 continue;
9143 }
Victor Stinner6345be92011-11-25 20:09:01 +01009144
Victor Stinner42bf7752011-11-21 22:52:58 +01009145 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009146 exc = NULL;
9147 raise_encode_exception(&exc, "decimal", unicode,
9148 startpos, startpos+1,
9149 "invalid decimal Unicode string");
9150 Py_XDECREF(exc);
9151 Py_DECREF(unicode);
9152 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009153 }
9154 /* 0-terminate the output string */
9155 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009156 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009157 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009158}
9159
Guido van Rossumd57fd912000-03-10 22:53:23 +00009160/* --- Helpers ------------------------------------------------------------ */
9161
/* Helper macro to fix up start/end slice values against a sequence of
   length `len`:

     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` has `len` added to it and, if it is
       still negative afterwards, is clamped to 0.

   `start` and `end` must be modifiable lvalues.  All three arguments may be
   evaluated more than once, so they must be free of side effects.

   The body is wrapped in do { ... } while (0) so that an invocation
   followed by a semicolon forms exactly one statement and can safely be
   used as the unbraced body of an if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009177static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009178any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009180 Py_ssize_t end,
9181 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009182{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009183 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009184 void *buf1, *buf2;
9185 Py_ssize_t len1, len2, result;
9186
9187 kind1 = PyUnicode_KIND(s1);
9188 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009189 if (kind1 < kind2)
9190 return -1;
9191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009192 len1 = PyUnicode_GET_LENGTH(s1);
9193 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009194 ADJUST_INDICES(start, end, len1);
9195 if (end - start < len2)
9196 return -1;
9197
9198 buf1 = PyUnicode_DATA(s1);
9199 buf2 = PyUnicode_DATA(s2);
9200 if (len2 == 1) {
9201 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9202 result = findchar((const char *)buf1 + kind1*start,
9203 kind1, end - start, ch, direction);
9204 if (result == -1)
9205 return -1;
9206 else
9207 return start + result;
9208 }
9209
9210 if (kind2 != kind1) {
9211 buf2 = _PyUnicode_AsKind(s2, kind1);
9212 if (!buf2)
9213 return -2;
9214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009215
Victor Stinner794d5672011-10-10 03:21:36 +02009216 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009217 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009218 case PyUnicode_1BYTE_KIND:
9219 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9220 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9221 else
9222 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9223 break;
9224 case PyUnicode_2BYTE_KIND:
9225 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9226 break;
9227 case PyUnicode_4BYTE_KIND:
9228 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9229 break;
9230 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009231 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009232 }
9233 }
9234 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009235 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009236 case PyUnicode_1BYTE_KIND:
9237 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9238 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9239 else
9240 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9241 break;
9242 case PyUnicode_2BYTE_KIND:
9243 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9244 break;
9245 case PyUnicode_4BYTE_KIND:
9246 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9247 break;
9248 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009249 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009250 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009251 }
9252
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009253 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254 PyMem_Free(buf2);
9255
9256 return result;
9257}
9258
Victor Stinner59423e32018-11-26 13:40:01 +01009259/* _PyUnicode_InsertThousandsGrouping() helper functions */
9260#include "stringlib/localeutil.h"
9261
9262/**
9263 * InsertThousandsGrouping:
9264 * @writer: Unicode writer.
9265 * @n_buffer: Number of characters in @buffer.
9266 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9267 * @d_pos: Start of digits string.
9268 * @n_digits: The number of digits in the string, in which we want
9269 * to put the grouping chars.
9270 * @min_width: The minimum width of the digits in the output string.
9271 * Output will be zero-padded on the left to fill.
9272 * @grouping: see definition in localeconv().
9273 * @thousands_sep: see definition in localeconv().
9274 *
9275 * There are 2 modes: counting and filling. If @writer is NULL,
9276 * we are in counting mode, else filling mode.
9277 * If counting, the required buffer size is returned.
9278 * If filling, we know the buffer will be large enough, so we don't
9279 * need to pass in the buffer size.
9280 * Inserts thousand grouping characters (as defined by grouping and
9281 * thousands_sep) into @writer.
9282 *
9283 * Return value: -1 on error, number of characters otherwise.
9284 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009286_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009287 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009288 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009289 PyObject *digits,
9290 Py_ssize_t d_pos,
9291 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009292 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009293 const char *grouping,
9294 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009295 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009296{
Xtreak3f7983a2019-01-07 20:39:14 +05309297 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009298 if (writer) {
9299 assert(digits != NULL);
9300 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009301 }
9302 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009303 assert(digits == NULL);
9304 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009305 }
Victor Stinner59423e32018-11-26 13:40:01 +01009306 assert(0 <= d_pos);
9307 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009308 assert(grouping != NULL);
9309
9310 if (digits != NULL) {
9311 if (PyUnicode_READY(digits) == -1) {
9312 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009313 }
Victor Stinner59423e32018-11-26 13:40:01 +01009314 }
9315 if (PyUnicode_READY(thousands_sep) == -1) {
9316 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009317 }
9318
Victor Stinner59423e32018-11-26 13:40:01 +01009319 Py_ssize_t count = 0;
9320 Py_ssize_t n_zeros;
9321 int loop_broken = 0;
9322 int use_separator = 0; /* First time through, don't append the
9323 separator. They only go between
9324 groups. */
9325 Py_ssize_t buffer_pos;
9326 Py_ssize_t digits_pos;
9327 Py_ssize_t len;
9328 Py_ssize_t n_chars;
9329 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9330 be looked at */
9331 /* A generator that returns all of the grouping widths, until it
9332 returns 0. */
9333 GroupGenerator groupgen;
9334 GroupGenerator_init(&groupgen, grouping);
9335 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9336
9337 /* if digits are not grouped, thousands separator
9338 should be an empty string */
9339 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9340
9341 digits_pos = d_pos + n_digits;
9342 if (writer) {
9343 buffer_pos = writer->pos + n_buffer;
9344 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9345 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 }
Victor Stinner59423e32018-11-26 13:40:01 +01009347 else {
9348 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009349 }
Victor Stinner59423e32018-11-26 13:40:01 +01009350
9351 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009352 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009353 }
Victor Stinner59423e32018-11-26 13:40:01 +01009354
9355 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9356 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9357 n_zeros = Py_MAX(0, len - remaining);
9358 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9359
9360 /* Use n_zero zero's and n_chars chars */
9361
9362 /* Count only, don't do anything. */
9363 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9364
9365 /* Copy into the writer. */
9366 InsertThousandsGrouping_fill(writer, &buffer_pos,
9367 digits, &digits_pos,
9368 n_chars, n_zeros,
9369 use_separator ? thousands_sep : NULL,
9370 thousands_sep_len, maxchar);
9371
9372 /* Use a separator next time. */
9373 use_separator = 1;
9374
9375 remaining -= n_chars;
9376 min_width -= len;
9377
9378 if (remaining <= 0 && min_width <= 0) {
9379 loop_broken = 1;
9380 break;
9381 }
9382 min_width -= thousands_sep_len;
9383 }
9384 if (!loop_broken) {
9385 /* We left the loop without using a break statement. */
9386
9387 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9388 n_zeros = Py_MAX(0, len - remaining);
9389 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9390
9391 /* Use n_zero zero's and n_chars chars */
9392 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9393
9394 /* Copy into the writer. */
9395 InsertThousandsGrouping_fill(writer, &buffer_pos,
9396 digits, &digits_pos,
9397 n_chars, n_zeros,
9398 use_separator ? thousands_sep : NULL,
9399 thousands_sep_len, maxchar);
9400 }
9401 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402}
9403
9404
Alexander Belopolsky40018472011-02-26 01:02:56 +00009405Py_ssize_t
9406PyUnicode_Count(PyObject *str,
9407 PyObject *substr,
9408 Py_ssize_t start,
9409 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009411 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009412 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 void *buf1 = NULL, *buf2 = NULL;
9414 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009415
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009416 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009417 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009418
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009419 kind1 = PyUnicode_KIND(str);
9420 kind2 = PyUnicode_KIND(substr);
9421 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009422 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009423
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009424 len1 = PyUnicode_GET_LENGTH(str);
9425 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009426 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009427 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009428 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009429
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009430 buf1 = PyUnicode_DATA(str);
9431 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009432 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009433 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009434 if (!buf2)
9435 goto onError;
9436 }
9437
9438 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009440 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009441 result = asciilib_count(
9442 ((Py_UCS1*)buf1) + start, end - start,
9443 buf2, len2, PY_SSIZE_T_MAX
9444 );
9445 else
9446 result = ucs1lib_count(
9447 ((Py_UCS1*)buf1) + start, end - start,
9448 buf2, len2, PY_SSIZE_T_MAX
9449 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 break;
9451 case PyUnicode_2BYTE_KIND:
9452 result = ucs2lib_count(
9453 ((Py_UCS2*)buf1) + start, end - start,
9454 buf2, len2, PY_SSIZE_T_MAX
9455 );
9456 break;
9457 case PyUnicode_4BYTE_KIND:
9458 result = ucs4lib_count(
9459 ((Py_UCS4*)buf1) + start, end - start,
9460 buf2, len2, PY_SSIZE_T_MAX
9461 );
9462 break;
9463 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009464 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009466
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009467 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 PyMem_Free(buf2);
9469
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009472 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 PyMem_Free(buf2);
9474 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475}
9476
Alexander Belopolsky40018472011-02-26 01:02:56 +00009477Py_ssize_t
9478PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009479 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009480 Py_ssize_t start,
9481 Py_ssize_t end,
9482 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009484 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009485 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009486
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009487 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488}
9489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490Py_ssize_t
9491PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9492 Py_ssize_t start, Py_ssize_t end,
9493 int direction)
9494{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009496 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497 if (PyUnicode_READY(str) == -1)
9498 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009499 len = PyUnicode_GET_LENGTH(str);
9500 ADJUST_INDICES(start, end, len);
9501 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009502 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009504 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9505 kind, end-start, ch, direction);
9506 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009508 else
9509 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510}
9511
Alexander Belopolsky40018472011-02-26 01:02:56 +00009512static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009513tailmatch(PyObject *self,
9514 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009515 Py_ssize_t start,
9516 Py_ssize_t end,
9517 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009519 int kind_self;
9520 int kind_sub;
9521 void *data_self;
9522 void *data_sub;
9523 Py_ssize_t offset;
9524 Py_ssize_t i;
9525 Py_ssize_t end_sub;
9526
9527 if (PyUnicode_READY(self) == -1 ||
9528 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009529 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9532 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009534 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009536 if (PyUnicode_GET_LENGTH(substring) == 0)
9537 return 1;
9538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009539 kind_self = PyUnicode_KIND(self);
9540 data_self = PyUnicode_DATA(self);
9541 kind_sub = PyUnicode_KIND(substring);
9542 data_sub = PyUnicode_DATA(substring);
9543 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9544
9545 if (direction > 0)
9546 offset = end;
9547 else
9548 offset = start;
9549
9550 if (PyUnicode_READ(kind_self, data_self, offset) ==
9551 PyUnicode_READ(kind_sub, data_sub, 0) &&
9552 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9553 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9554 /* If both are of the same kind, memcmp is sufficient */
9555 if (kind_self == kind_sub) {
9556 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009557 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 data_sub,
9559 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009560 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009562 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 else {
9564 /* We do not need to compare 0 and len(substring)-1 because
9565 the if statement above ensured already that they are equal
9566 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 for (i = 1; i < end_sub; ++i) {
9568 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9569 PyUnicode_READ(kind_sub, data_sub, i))
9570 return 0;
9571 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009572 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574 }
9575
9576 return 0;
9577}
9578
Alexander Belopolsky40018472011-02-26 01:02:56 +00009579Py_ssize_t
9580PyUnicode_Tailmatch(PyObject *str,
9581 PyObject *substr,
9582 Py_ssize_t start,
9583 Py_ssize_t end,
9584 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009586 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009587 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009588
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009589 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590}
9591
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009592static PyObject *
9593ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009594{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009595 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9596 char *resdata, *data = PyUnicode_DATA(self);
9597 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009598
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009599 res = PyUnicode_New(len, 127);
9600 if (res == NULL)
9601 return NULL;
9602 resdata = PyUnicode_DATA(res);
9603 if (lower)
9604 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009606 _Py_bytes_upper(resdata, data, len);
9607 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608}
9609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009611handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009612{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009613 Py_ssize_t j;
9614 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009615 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009616 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009617
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009618 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9619
9620 where ! is a negation and \p{xxx} is a character with property xxx.
9621 */
9622 for (j = i - 1; j >= 0; j--) {
9623 c = PyUnicode_READ(kind, data, j);
9624 if (!_PyUnicode_IsCaseIgnorable(c))
9625 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009627 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9628 if (final_sigma) {
9629 for (j = i + 1; j < length; j++) {
9630 c = PyUnicode_READ(kind, data, j);
9631 if (!_PyUnicode_IsCaseIgnorable(c))
9632 break;
9633 }
9634 final_sigma = j == length || !_PyUnicode_IsCased(c);
9635 }
9636 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637}
9638
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009639static int
9640lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9641 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009642{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009643 /* Obscure special case. */
9644 if (c == 0x3A3) {
9645 mapped[0] = handle_capital_sigma(kind, data, length, i);
9646 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009648 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649}
9650
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009651static Py_ssize_t
9652do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009654 Py_ssize_t i, k = 0;
9655 int n_res, j;
9656 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009657
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658 c = PyUnicode_READ(kind, data, 0);
9659 n_res = _PyUnicode_ToUpperFull(c, mapped);
9660 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009661 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009664 for (i = 1; i < length; i++) {
9665 c = PyUnicode_READ(kind, data, i);
9666 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9667 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009668 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009669 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009670 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009671 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009672 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009673}
9674
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009675static Py_ssize_t
9676do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9677 Py_ssize_t i, k = 0;
9678
9679 for (i = 0; i < length; i++) {
9680 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9681 int n_res, j;
9682 if (Py_UNICODE_ISUPPER(c)) {
9683 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9684 }
9685 else if (Py_UNICODE_ISLOWER(c)) {
9686 n_res = _PyUnicode_ToUpperFull(c, mapped);
9687 }
9688 else {
9689 n_res = 1;
9690 mapped[0] = c;
9691 }
9692 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009693 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009694 res[k++] = mapped[j];
9695 }
9696 }
9697 return k;
9698}
9699
9700static Py_ssize_t
9701do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9702 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009703{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009704 Py_ssize_t i, k = 0;
9705
9706 for (i = 0; i < length; i++) {
9707 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9708 int n_res, j;
9709 if (lower)
9710 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9711 else
9712 n_res = _PyUnicode_ToUpperFull(c, mapped);
9713 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009714 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009715 res[k++] = mapped[j];
9716 }
9717 }
9718 return k;
9719}
9720
9721static Py_ssize_t
9722do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9723{
9724 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9725}
9726
9727static Py_ssize_t
9728do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9729{
9730 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9731}
9732
Benjamin Petersone51757f2012-01-12 21:10:29 -05009733static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009734do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9735{
9736 Py_ssize_t i, k = 0;
9737
9738 for (i = 0; i < length; i++) {
9739 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9740 Py_UCS4 mapped[3];
9741 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9742 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009743 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009744 res[k++] = mapped[j];
9745 }
9746 }
9747 return k;
9748}
9749
9750static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009751do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9752{
9753 Py_ssize_t i, k = 0;
9754 int previous_is_cased;
9755
9756 previous_is_cased = 0;
9757 for (i = 0; i < length; i++) {
9758 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9759 Py_UCS4 mapped[3];
9760 int n_res, j;
9761
9762 if (previous_is_cased)
9763 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9764 else
9765 n_res = _PyUnicode_ToTitleFull(c, mapped);
9766
9767 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009768 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009769 res[k++] = mapped[j];
9770 }
9771
9772 previous_is_cased = _PyUnicode_IsCased(c);
9773 }
9774 return k;
9775}
9776
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009777static PyObject *
9778case_operation(PyObject *self,
9779 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9780{
9781 PyObject *res = NULL;
9782 Py_ssize_t length, newlength = 0;
9783 int kind, outkind;
9784 void *data, *outdata;
9785 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9786
Benjamin Petersoneea48462012-01-16 14:28:50 -05009787 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009788
9789 kind = PyUnicode_KIND(self);
9790 data = PyUnicode_DATA(self);
9791 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009792 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009793 PyErr_SetString(PyExc_OverflowError, "string is too long");
9794 return NULL;
9795 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009796 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009797 if (tmp == NULL)
9798 return PyErr_NoMemory();
9799 newlength = perform(kind, data, length, tmp, &maxchar);
9800 res = PyUnicode_New(newlength, maxchar);
9801 if (res == NULL)
9802 goto leave;
9803 tmpend = tmp + newlength;
9804 outdata = PyUnicode_DATA(res);
9805 outkind = PyUnicode_KIND(res);
9806 switch (outkind) {
9807 case PyUnicode_1BYTE_KIND:
9808 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9809 break;
9810 case PyUnicode_2BYTE_KIND:
9811 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9812 break;
9813 case PyUnicode_4BYTE_KIND:
9814 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9815 break;
9816 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009817 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009818 }
9819 leave:
9820 PyMem_FREE(tmp);
9821 return res;
9822}
9823
Tim Peters8ce9f162004-08-27 01:49:32 +00009824PyObject *
9825PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009827 PyObject *res;
9828 PyObject *fseq;
9829 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009830 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009831
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009832 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009833 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009834 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009835 }
9836
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009837 /* NOTE: the following code can't call back into Python code,
9838 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009839 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009840
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009841 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009842 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009843 res = _PyUnicode_JoinArray(separator, items, seqlen);
9844 Py_DECREF(fseq);
9845 return res;
9846}
9847
9848PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009849_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009850{
9851 PyObject *res = NULL; /* the result */
9852 PyObject *sep = NULL;
9853 Py_ssize_t seplen;
9854 PyObject *item;
9855 Py_ssize_t sz, i, res_offset;
9856 Py_UCS4 maxchar;
9857 Py_UCS4 item_maxchar;
9858 int use_memcpy;
9859 unsigned char *res_data = NULL, *sep_data = NULL;
9860 PyObject *last_obj;
9861 unsigned int kind = 0;
9862
Tim Peters05eba1f2004-08-27 21:32:02 +00009863 /* If empty sequence, return u"". */
9864 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009865 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009866 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009867
Tim Peters05eba1f2004-08-27 21:32:02 +00009868 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009869 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009870 if (seqlen == 1) {
9871 if (PyUnicode_CheckExact(items[0])) {
9872 res = items[0];
9873 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009874 return res;
9875 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009876 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009877 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009878 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009879 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009880 /* Set up sep and seplen */
9881 if (separator == NULL) {
9882 /* fall back to a blank space separator */
9883 sep = PyUnicode_FromOrdinal(' ');
9884 if (!sep)
9885 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009886 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009887 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009888 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009889 else {
9890 if (!PyUnicode_Check(separator)) {
9891 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009892 "separator: expected str instance,"
9893 " %.80s found",
9894 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009895 goto onError;
9896 }
9897 if (PyUnicode_READY(separator))
9898 goto onError;
9899 sep = separator;
9900 seplen = PyUnicode_GET_LENGTH(separator);
9901 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9902 /* inc refcount to keep this code path symmetric with the
9903 above case of a blank separator */
9904 Py_INCREF(sep);
9905 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009906 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009907 }
9908
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009909 /* There are at least two things to join, or else we have a subclass
9910 * of str in the sequence.
9911 * Do a pre-pass to figure out the total amount of space we'll
9912 * need (sz), and see whether all argument are strings.
9913 */
9914 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009915#ifdef Py_DEBUG
9916 use_memcpy = 0;
9917#else
9918 use_memcpy = 1;
9919#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009920 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009921 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009922 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009923 if (!PyUnicode_Check(item)) {
9924 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009925 "sequence item %zd: expected str instance,"
9926 " %.80s found",
9927 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009928 goto onError;
9929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 if (PyUnicode_READY(item) == -1)
9931 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009932 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009934 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009935 if (i != 0) {
9936 add_sz += seplen;
9937 }
9938 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009939 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009940 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009941 goto onError;
9942 }
Xiang Zhangb0541f42017-01-10 10:52:00 +08009943 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +02009944 if (use_memcpy && last_obj != NULL) {
9945 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9946 use_memcpy = 0;
9947 }
9948 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009949 }
Tim Petersced69f82003-09-16 20:30:58 +00009950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009952 if (res == NULL)
9953 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009954
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009955 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009956#ifdef Py_DEBUG
9957 use_memcpy = 0;
9958#else
9959 if (use_memcpy) {
9960 res_data = PyUnicode_1BYTE_DATA(res);
9961 kind = PyUnicode_KIND(res);
9962 if (seplen != 0)
9963 sep_data = PyUnicode_1BYTE_DATA(sep);
9964 }
9965#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009966 if (use_memcpy) {
9967 for (i = 0; i < seqlen; ++i) {
9968 Py_ssize_t itemlen;
9969 item = items[i];
9970
9971 /* Copy item, and maybe the separator. */
9972 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009973 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009974 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009975 kind * seplen);
9976 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009977 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009978
9979 itemlen = PyUnicode_GET_LENGTH(item);
9980 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009981 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009982 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009983 kind * itemlen);
9984 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009985 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009986 }
9987 assert(res_data == PyUnicode_1BYTE_DATA(res)
9988 + kind * PyUnicode_GET_LENGTH(res));
9989 }
9990 else {
9991 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9992 Py_ssize_t itemlen;
9993 item = items[i];
9994
9995 /* Copy item, and maybe the separator. */
9996 if (i && seplen != 0) {
9997 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9998 res_offset += seplen;
9999 }
10000
10001 itemlen = PyUnicode_GET_LENGTH(item);
10002 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010003 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010004 res_offset += itemlen;
10005 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010006 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010007 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010008 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010009
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010011 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010013
Benjamin Peterson29060642009-01-31 22:14:21 +000010014 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010016 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010017 return NULL;
10018}
10019
Victor Stinnerd3f08822012-05-29 12:57:52 +020010020void
10021_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10022 Py_UCS4 fill_char)
10023{
10024 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010025 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010026 assert(PyUnicode_IS_READY(unicode));
10027 assert(unicode_modifiable(unicode));
10028 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10029 assert(start >= 0);
10030 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010031 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010032}
10033
Victor Stinner3fe55312012-01-04 00:33:50 +010010034Py_ssize_t
10035PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10036 Py_UCS4 fill_char)
10037{
10038 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010039
10040 if (!PyUnicode_Check(unicode)) {
10041 PyErr_BadInternalCall();
10042 return -1;
10043 }
10044 if (PyUnicode_READY(unicode) == -1)
10045 return -1;
10046 if (unicode_check_modifiable(unicode))
10047 return -1;
10048
Victor Stinnerd3f08822012-05-29 12:57:52 +020010049 if (start < 0) {
10050 PyErr_SetString(PyExc_IndexError, "string index out of range");
10051 return -1;
10052 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010053 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10054 PyErr_SetString(PyExc_ValueError,
10055 "fill character is bigger than "
10056 "the string maximum character");
10057 return -1;
10058 }
10059
10060 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10061 length = Py_MIN(maxlen, length);
10062 if (length <= 0)
10063 return 0;
10064
Victor Stinnerd3f08822012-05-29 12:57:52 +020010065 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010066 return length;
10067}
10068
Victor Stinner9310abb2011-10-05 00:59:23 +020010069static PyObject *
10070pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010071 Py_ssize_t left,
10072 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010074{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 PyObject *u;
10076 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010077 int kind;
10078 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010079
10080 if (left < 0)
10081 left = 0;
10082 if (right < 0)
10083 right = 0;
10084
Victor Stinnerc4b49542011-12-11 22:44:26 +010010085 if (left == 0 && right == 0)
10086 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10089 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010090 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10091 return NULL;
10092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010094 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010096 if (!u)
10097 return NULL;
10098
10099 kind = PyUnicode_KIND(u);
10100 data = PyUnicode_DATA(u);
10101 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010102 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010103 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010104 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010105 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010106 assert(_PyUnicode_CheckConsistency(u, 1));
10107 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010108}
10109
Alexander Belopolsky40018472011-02-26 01:02:56 +000010110PyObject *
10111PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010112{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010113 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010114
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010115 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010116 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010117
Benjamin Petersonead6b532011-12-20 17:23:42 -060010118 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010120 if (PyUnicode_IS_ASCII(string))
10121 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010122 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010123 PyUnicode_GET_LENGTH(string), keepends);
10124 else
10125 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010126 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010127 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 break;
10129 case PyUnicode_2BYTE_KIND:
10130 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010131 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 PyUnicode_GET_LENGTH(string), keepends);
10133 break;
10134 case PyUnicode_4BYTE_KIND:
10135 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010136 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 PyUnicode_GET_LENGTH(string), keepends);
10138 break;
10139 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010140 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010143}
10144
Alexander Belopolsky40018472011-02-26 01:02:56 +000010145static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010146split(PyObject *self,
10147 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010148 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010150 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 void *buf1, *buf2;
10152 Py_ssize_t len1, len2;
10153 PyObject* out;
10154
Guido van Rossumd57fd912000-03-10 22:53:23 +000010155 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010156 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 if (PyUnicode_READY(self) == -1)
10159 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010162 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010164 if (PyUnicode_IS_ASCII(self))
10165 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010166 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010167 PyUnicode_GET_LENGTH(self), maxcount
10168 );
10169 else
10170 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010171 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010172 PyUnicode_GET_LENGTH(self), maxcount
10173 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 case PyUnicode_2BYTE_KIND:
10175 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010176 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 PyUnicode_GET_LENGTH(self), maxcount
10178 );
10179 case PyUnicode_4BYTE_KIND:
10180 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010181 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 PyUnicode_GET_LENGTH(self), maxcount
10183 );
10184 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010185 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 }
10187
10188 if (PyUnicode_READY(substring) == -1)
10189 return NULL;
10190
10191 kind1 = PyUnicode_KIND(self);
10192 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 len1 = PyUnicode_GET_LENGTH(self);
10194 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010195 if (kind1 < kind2 || len1 < len2) {
10196 out = PyList_New(1);
10197 if (out == NULL)
10198 return NULL;
10199 Py_INCREF(self);
10200 PyList_SET_ITEM(out, 0, self);
10201 return out;
10202 }
10203 buf1 = PyUnicode_DATA(self);
10204 buf2 = PyUnicode_DATA(substring);
10205 if (kind2 != kind1) {
10206 buf2 = _PyUnicode_AsKind(substring, kind1);
10207 if (!buf2)
10208 return NULL;
10209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010211 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010213 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10214 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010215 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010216 else
10217 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010218 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 break;
10220 case PyUnicode_2BYTE_KIND:
10221 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010222 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 break;
10224 case PyUnicode_4BYTE_KIND:
10225 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010226 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 break;
10228 default:
10229 out = NULL;
10230 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010231 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 PyMem_Free(buf2);
10233 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234}
10235
Alexander Belopolsky40018472011-02-26 01:02:56 +000010236static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010237rsplit(PyObject *self,
10238 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010239 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010240{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010241 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 void *buf1, *buf2;
10243 Py_ssize_t len1, len2;
10244 PyObject* out;
10245
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010246 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010247 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 if (PyUnicode_READY(self) == -1)
10250 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010253 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010255 if (PyUnicode_IS_ASCII(self))
10256 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010257 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010258 PyUnicode_GET_LENGTH(self), maxcount
10259 );
10260 else
10261 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010262 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010263 PyUnicode_GET_LENGTH(self), maxcount
10264 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 case PyUnicode_2BYTE_KIND:
10266 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010267 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 PyUnicode_GET_LENGTH(self), maxcount
10269 );
10270 case PyUnicode_4BYTE_KIND:
10271 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010272 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 PyUnicode_GET_LENGTH(self), maxcount
10274 );
10275 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010276 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 }
10278
10279 if (PyUnicode_READY(substring) == -1)
10280 return NULL;
10281
10282 kind1 = PyUnicode_KIND(self);
10283 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 len1 = PyUnicode_GET_LENGTH(self);
10285 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010286 if (kind1 < kind2 || len1 < len2) {
10287 out = PyList_New(1);
10288 if (out == NULL)
10289 return NULL;
10290 Py_INCREF(self);
10291 PyList_SET_ITEM(out, 0, self);
10292 return out;
10293 }
10294 buf1 = PyUnicode_DATA(self);
10295 buf2 = PyUnicode_DATA(substring);
10296 if (kind2 != kind1) {
10297 buf2 = _PyUnicode_AsKind(substring, kind1);
10298 if (!buf2)
10299 return NULL;
10300 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010302 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010304 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10305 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010306 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010307 else
10308 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010309 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 break;
10311 case PyUnicode_2BYTE_KIND:
10312 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010313 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 break;
10315 case PyUnicode_4BYTE_KIND:
10316 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010317 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 break;
10319 default:
10320 out = NULL;
10321 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010322 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 PyMem_Free(buf2);
10324 return out;
10325}
10326
10327static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010328anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10329 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010331 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010333 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10334 return asciilib_find(buf1, len1, buf2, len2, offset);
10335 else
10336 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 case PyUnicode_2BYTE_KIND:
10338 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10339 case PyUnicode_4BYTE_KIND:
10340 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10341 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010342 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343}
10344
10345static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010346anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10347 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010349 switch (kind) {
10350 case PyUnicode_1BYTE_KIND:
10351 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10352 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10353 else
10354 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10355 case PyUnicode_2BYTE_KIND:
10356 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10357 case PyUnicode_4BYTE_KIND:
10358 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10359 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010360 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010361}
10362
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010363static void
10364replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10365 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10366{
10367 int kind = PyUnicode_KIND(u);
10368 void *data = PyUnicode_DATA(u);
10369 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10370 if (kind == PyUnicode_1BYTE_KIND) {
10371 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10372 (Py_UCS1 *)data + len,
10373 u1, u2, maxcount);
10374 }
10375 else if (kind == PyUnicode_2BYTE_KIND) {
10376 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10377 (Py_UCS2 *)data + len,
10378 u1, u2, maxcount);
10379 }
10380 else {
10381 assert(kind == PyUnicode_4BYTE_KIND);
10382 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10383 (Py_UCS4 *)data + len,
10384 u1, u2, maxcount);
10385 }
10386}
10387
Alexander Belopolsky40018472011-02-26 01:02:56 +000010388static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389replace(PyObject *self, PyObject *str1,
10390 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 PyObject *u;
10393 char *sbuf = PyUnicode_DATA(self);
10394 char *buf1 = PyUnicode_DATA(str1);
10395 char *buf2 = PyUnicode_DATA(str2);
10396 int srelease = 0, release1 = 0, release2 = 0;
10397 int skind = PyUnicode_KIND(self);
10398 int kind1 = PyUnicode_KIND(str1);
10399 int kind2 = PyUnicode_KIND(str2);
10400 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10401 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10402 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010403 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010404 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405
10406 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010407 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010409 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010410
Victor Stinner59de0ee2011-10-07 10:01:28 +020010411 if (str1 == str2)
10412 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413
Victor Stinner49a0a212011-10-12 23:46:10 +020010414 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010415 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10416 if (maxchar < maxchar_str1)
10417 /* substring too wide to be present */
10418 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010419 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10420 /* Replacing str1 with str2 may cause a maxchar reduction in the
10421 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010422 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010423 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010426 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010428 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010430 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010431 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010432 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010433
Victor Stinner69ed0f42013-04-09 21:48:24 +020010434 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010435 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010436 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010437 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010438 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010440 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010442
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010443 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10444 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010445 }
10446 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 int rkind = skind;
10448 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010449 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 if (kind1 < rkind) {
10452 /* widen substring */
10453 buf1 = _PyUnicode_AsKind(str1, rkind);
10454 if (!buf1) goto error;
10455 release1 = 1;
10456 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010457 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010458 if (i < 0)
10459 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 if (rkind > kind2) {
10461 /* widen replacement */
10462 buf2 = _PyUnicode_AsKind(str2, rkind);
10463 if (!buf2) goto error;
10464 release2 = 1;
10465 }
10466 else if (rkind < kind2) {
10467 /* widen self and buf1 */
10468 rkind = kind2;
10469 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010470 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 sbuf = _PyUnicode_AsKind(self, rkind);
10472 if (!sbuf) goto error;
10473 srelease = 1;
10474 buf1 = _PyUnicode_AsKind(str1, rkind);
10475 if (!buf1) goto error;
10476 release1 = 1;
10477 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010478 u = PyUnicode_New(slen, maxchar);
10479 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010481 assert(PyUnicode_KIND(u) == rkind);
10482 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010483
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010484 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010485 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010486 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010488 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010490
10491 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010492 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010493 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010494 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010495 if (i == -1)
10496 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010497 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010499 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010501 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010503 }
10504 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010506 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 int rkind = skind;
10508 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010511 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 buf1 = _PyUnicode_AsKind(str1, rkind);
10513 if (!buf1) goto error;
10514 release1 = 1;
10515 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010516 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010517 if (n == 0)
10518 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010520 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 buf2 = _PyUnicode_AsKind(str2, rkind);
10522 if (!buf2) goto error;
10523 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010524 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010526 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 rkind = kind2;
10528 sbuf = _PyUnicode_AsKind(self, rkind);
10529 if (!sbuf) goto error;
10530 srelease = 1;
10531 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010532 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 buf1 = _PyUnicode_AsKind(str1, rkind);
10534 if (!buf1) goto error;
10535 release1 = 1;
10536 }
10537 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10538 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010539 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 PyErr_SetString(PyExc_OverflowError,
10541 "replace string is too long");
10542 goto error;
10543 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010544 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010545 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010546 _Py_INCREF_UNICODE_EMPTY();
10547 if (!unicode_empty)
10548 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010549 u = unicode_empty;
10550 goto done;
10551 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010552 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 PyErr_SetString(PyExc_OverflowError,
10554 "replace string is too long");
10555 goto error;
10556 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010557 u = PyUnicode_New(new_size, maxchar);
10558 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010560 assert(PyUnicode_KIND(u) == rkind);
10561 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 ires = i = 0;
10563 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010564 while (n-- > 0) {
10565 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010566 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010567 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010568 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010569 if (j == -1)
10570 break;
10571 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010572 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010573 memcpy(res + rkind * ires,
10574 sbuf + rkind * i,
10575 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010577 }
10578 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010580 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010582 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010584 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010588 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010589 memcpy(res + rkind * ires,
10590 sbuf + rkind * i,
10591 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010592 }
10593 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010594 /* interleave */
10595 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010596 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010598 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010600 if (--n <= 0)
10601 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010602 memcpy(res + rkind * ires,
10603 sbuf + rkind * i,
10604 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 ires++;
10606 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010607 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010608 memcpy(res + rkind * ires,
10609 sbuf + rkind * i,
10610 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010611 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010612 }
10613
10614 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010615 unicode_adjust_maxchar(&u);
10616 if (u == NULL)
10617 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010619
10620 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 if (srelease)
10622 PyMem_FREE(sbuf);
10623 if (release1)
10624 PyMem_FREE(buf1);
10625 if (release2)
10626 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010627 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010629
Benjamin Peterson29060642009-01-31 22:14:21 +000010630 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010631 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 if (srelease)
10633 PyMem_FREE(sbuf);
10634 if (release1)
10635 PyMem_FREE(buf1);
10636 if (release2)
10637 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010638 return unicode_result_unchanged(self);
10639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 error:
10641 if (srelease && sbuf)
10642 PyMem_FREE(sbuf);
10643 if (release1 && buf1)
10644 PyMem_FREE(buf1);
10645 if (release2 && buf2)
10646 PyMem_FREE(buf2);
10647 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648}
10649
10650/* --- Unicode Object Methods --------------------------------------------- */
10651
INADA Naoki3ae20562017-01-16 20:41:20 +090010652/*[clinic input]
10653str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010654
INADA Naoki3ae20562017-01-16 20:41:20 +090010655Return a version of the string where each word is titlecased.
10656
10657More specifically, words start with uppercased characters and all remaining
10658cased characters have lower case.
10659[clinic start generated code]*/
10660
10661static PyObject *
10662unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010663/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010665 if (PyUnicode_READY(self) == -1)
10666 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010667 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668}
10669
INADA Naoki3ae20562017-01-16 20:41:20 +090010670/*[clinic input]
10671str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672
INADA Naoki3ae20562017-01-16 20:41:20 +090010673Return a capitalized version of the string.
10674
10675More specifically, make the first character have upper case and the rest lower
10676case.
10677[clinic start generated code]*/
10678
10679static PyObject *
10680unicode_capitalize_impl(PyObject *self)
10681/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010683 if (PyUnicode_READY(self) == -1)
10684 return NULL;
10685 if (PyUnicode_GET_LENGTH(self) == 0)
10686 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010687 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010688}
10689
INADA Naoki3ae20562017-01-16 20:41:20 +090010690/*[clinic input]
10691str.casefold as unicode_casefold
10692
10693Return a version of the string suitable for caseless comparisons.
10694[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010695
10696static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010697unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010698/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010699{
10700 if (PyUnicode_READY(self) == -1)
10701 return NULL;
10702 if (PyUnicode_IS_ASCII(self))
10703 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010704 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010705}
10706
10707
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010708/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010709
10710static int
10711convert_uc(PyObject *obj, void *addr)
10712{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010714
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010715 if (!PyUnicode_Check(obj)) {
10716 PyErr_Format(PyExc_TypeError,
10717 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010718 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010719 return 0;
10720 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010721 if (PyUnicode_READY(obj) < 0)
10722 return 0;
10723 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010724 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010726 return 0;
10727 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010728 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010729 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010730}
10731
INADA Naoki3ae20562017-01-16 20:41:20 +090010732/*[clinic input]
10733str.center as unicode_center
10734
10735 width: Py_ssize_t
10736 fillchar: Py_UCS4 = ' '
10737 /
10738
10739Return a centered string of length width.
10740
10741Padding is done using the specified fill character (default is a space).
10742[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743
10744static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010745unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10746/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010748 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749
Benjamin Petersonbac79492012-01-14 13:34:47 -050010750 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751 return NULL;
10752
Victor Stinnerc4b49542011-12-11 22:44:26 +010010753 if (PyUnicode_GET_LENGTH(self) >= width)
10754 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755
Victor Stinnerc4b49542011-12-11 22:44:26 +010010756 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757 left = marg / 2 + (marg & width & 1);
10758
Victor Stinner9310abb2011-10-05 00:59:23 +020010759 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760}
10761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010762/* This function assumes that str1 and str2 are readied by the caller. */
10763
Marc-André Lemburge5034372000-08-08 08:04:29 +000010764static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010765unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010766{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010767#define COMPARE(TYPE1, TYPE2) \
10768 do { \
10769 TYPE1* p1 = (TYPE1 *)data1; \
10770 TYPE2* p2 = (TYPE2 *)data2; \
10771 TYPE1* end = p1 + len; \
10772 Py_UCS4 c1, c2; \
10773 for (; p1 != end; p1++, p2++) { \
10774 c1 = *p1; \
10775 c2 = *p2; \
10776 if (c1 != c2) \
10777 return (c1 < c2) ? -1 : 1; \
10778 } \
10779 } \
10780 while (0)
10781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 int kind1, kind2;
10783 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010784 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 kind1 = PyUnicode_KIND(str1);
10787 kind2 = PyUnicode_KIND(str2);
10788 data1 = PyUnicode_DATA(str1);
10789 data2 = PyUnicode_DATA(str2);
10790 len1 = PyUnicode_GET_LENGTH(str1);
10791 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010792 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010793
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010794 switch(kind1) {
10795 case PyUnicode_1BYTE_KIND:
10796 {
10797 switch(kind2) {
10798 case PyUnicode_1BYTE_KIND:
10799 {
10800 int cmp = memcmp(data1, data2, len);
10801 /* normalize result of memcmp() into the range [-1; 1] */
10802 if (cmp < 0)
10803 return -1;
10804 if (cmp > 0)
10805 return 1;
10806 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010807 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010808 case PyUnicode_2BYTE_KIND:
10809 COMPARE(Py_UCS1, Py_UCS2);
10810 break;
10811 case PyUnicode_4BYTE_KIND:
10812 COMPARE(Py_UCS1, Py_UCS4);
10813 break;
10814 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010815 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010816 }
10817 break;
10818 }
10819 case PyUnicode_2BYTE_KIND:
10820 {
10821 switch(kind2) {
10822 case PyUnicode_1BYTE_KIND:
10823 COMPARE(Py_UCS2, Py_UCS1);
10824 break;
10825 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010826 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010827 COMPARE(Py_UCS2, Py_UCS2);
10828 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010829 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010830 case PyUnicode_4BYTE_KIND:
10831 COMPARE(Py_UCS2, Py_UCS4);
10832 break;
10833 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010834 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010835 }
10836 break;
10837 }
10838 case PyUnicode_4BYTE_KIND:
10839 {
10840 switch(kind2) {
10841 case PyUnicode_1BYTE_KIND:
10842 COMPARE(Py_UCS4, Py_UCS1);
10843 break;
10844 case PyUnicode_2BYTE_KIND:
10845 COMPARE(Py_UCS4, Py_UCS2);
10846 break;
10847 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010848 {
10849#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10850 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10851 /* normalize result of wmemcmp() into the range [-1; 1] */
10852 if (cmp < 0)
10853 return -1;
10854 if (cmp > 0)
10855 return 1;
10856#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010857 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010858#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010859 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010860 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010861 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010862 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010863 }
10864 break;
10865 }
10866 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010867 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010868 }
10869
Victor Stinner770e19e2012-10-04 22:59:45 +020010870 if (len1 == len2)
10871 return 0;
10872 if (len1 < len2)
10873 return -1;
10874 else
10875 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010876
10877#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010878}
10879
Benjamin Peterson621b4302016-09-09 13:54:34 -070010880static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010881unicode_compare_eq(PyObject *str1, PyObject *str2)
10882{
10883 int kind;
10884 void *data1, *data2;
10885 Py_ssize_t len;
10886 int cmp;
10887
Victor Stinnere5567ad2012-10-23 02:48:49 +020010888 len = PyUnicode_GET_LENGTH(str1);
10889 if (PyUnicode_GET_LENGTH(str2) != len)
10890 return 0;
10891 kind = PyUnicode_KIND(str1);
10892 if (PyUnicode_KIND(str2) != kind)
10893 return 0;
10894 data1 = PyUnicode_DATA(str1);
10895 data2 = PyUnicode_DATA(str2);
10896
10897 cmp = memcmp(data1, data2, len * kind);
10898 return (cmp == 0);
10899}
10900
10901
Alexander Belopolsky40018472011-02-26 01:02:56 +000010902int
10903PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010904{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010905 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10906 if (PyUnicode_READY(left) == -1 ||
10907 PyUnicode_READY(right) == -1)
10908 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010909
10910 /* a string is equal to itself */
10911 if (left == right)
10912 return 0;
10913
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010914 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010915 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010916 PyErr_Format(PyExc_TypeError,
10917 "Can't compare %.100s and %.100s",
10918 left->ob_type->tp_name,
10919 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010920 return -1;
10921}
10922
Martin v. Löwis5b222132007-06-10 09:51:05 +000010923int
10924PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010926 Py_ssize_t i;
10927 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010928 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010929 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930
Victor Stinner910337b2011-10-03 03:20:16 +020010931 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010932 if (!PyUnicode_IS_READY(uni)) {
10933 const wchar_t *ws = _PyUnicode_WSTR(uni);
10934 /* Compare Unicode string and source character set string */
10935 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
10936 if (chr != ustr[i])
10937 return (chr < ustr[i]) ? -1 : 1;
10938 }
10939 /* This check keeps Python strings that end in '\0' from comparing equal
10940 to C strings identical up to that point. */
10941 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
10942 return 1; /* uni is longer */
10943 if (ustr[i])
10944 return -1; /* str is longer */
10945 return 0;
10946 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010948 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010949 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010950 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010951 size_t len, len2 = strlen(str);
10952 int cmp;
10953
10954 len = Py_MIN(len1, len2);
10955 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010956 if (cmp != 0) {
10957 if (cmp < 0)
10958 return -1;
10959 else
10960 return 1;
10961 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010962 if (len1 > len2)
10963 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010964 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010965 return -1; /* str is longer */
10966 return 0;
10967 }
10968 else {
10969 void *data = PyUnicode_DATA(uni);
10970 /* Compare Unicode string and source character set string */
10971 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010972 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010973 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10974 /* This check keeps Python strings that end in '\0' from comparing equal
10975 to C strings identical up to that point. */
10976 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10977 return 1; /* uni is longer */
10978 if (str[i])
10979 return -1; /* str is longer */
10980 return 0;
10981 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010982}
10983
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020010984static int
10985non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
10986{
10987 size_t i, len;
10988 const wchar_t *p;
10989 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
10990 if (strlen(str) != len)
10991 return 0;
10992 p = _PyUnicode_WSTR(unicode);
10993 assert(p);
10994 for (i = 0; i < len; i++) {
10995 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020010996 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020010997 return 0;
10998 }
10999 return 1;
11000}
11001
11002int
11003_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11004{
11005 size_t len;
11006 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011007 assert(str);
11008#ifndef NDEBUG
11009 for (const char *p = str; *p; p++) {
11010 assert((unsigned char)*p < 128);
11011 }
11012#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011013 if (PyUnicode_READY(unicode) == -1) {
11014 /* Memory error or bad data */
11015 PyErr_Clear();
11016 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11017 }
11018 if (!PyUnicode_IS_ASCII(unicode))
11019 return 0;
11020 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11021 return strlen(str) == len &&
11022 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11023}
11024
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011025int
11026_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11027{
11028 PyObject *right_uni;
11029 Py_hash_t hash;
11030
11031 assert(_PyUnicode_CHECK(left));
11032 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011033#ifndef NDEBUG
11034 for (const char *p = right->string; *p; p++) {
11035 assert((unsigned char)*p < 128);
11036 }
11037#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011038
11039 if (PyUnicode_READY(left) == -1) {
11040 /* memory error or bad data */
11041 PyErr_Clear();
11042 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11043 }
11044
11045 if (!PyUnicode_IS_ASCII(left))
11046 return 0;
11047
11048 right_uni = _PyUnicode_FromId(right); /* borrowed */
11049 if (right_uni == NULL) {
11050 /* memory error or bad data */
11051 PyErr_Clear();
11052 return _PyUnicode_EqualToASCIIString(left, right->string);
11053 }
11054
11055 if (left == right_uni)
11056 return 1;
11057
11058 if (PyUnicode_CHECK_INTERNED(left))
11059 return 0;
11060
INADA Naoki7cc95f52018-01-28 02:07:09 +090011061 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011062 hash = _PyUnicode_HASH(left);
11063 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11064 return 0;
11065
11066 return unicode_compare_eq(left, right_uni);
11067}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011068
Alexander Belopolsky40018472011-02-26 01:02:56 +000011069PyObject *
11070PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011071{
11072 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011073
Victor Stinnere5567ad2012-10-23 02:48:49 +020011074 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11075 Py_RETURN_NOTIMPLEMENTED;
11076
11077 if (PyUnicode_READY(left) == -1 ||
11078 PyUnicode_READY(right) == -1)
11079 return NULL;
11080
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011081 if (left == right) {
11082 switch (op) {
11083 case Py_EQ:
11084 case Py_LE:
11085 case Py_GE:
11086 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011087 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011088 case Py_NE:
11089 case Py_LT:
11090 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011091 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011092 default:
11093 PyErr_BadArgument();
11094 return NULL;
11095 }
11096 }
11097 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011098 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011099 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011100 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011101 }
11102 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011103 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011104 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011105 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011106}
11107
Alexander Belopolsky40018472011-02-26 01:02:56 +000011108int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011109_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11110{
11111 return unicode_eq(aa, bb);
11112}
11113
11114int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011115PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011116{
Victor Stinner77282cb2013-04-14 19:22:47 +020011117 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011118 void *buf1, *buf2;
11119 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011120 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011121
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011122 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011123 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011124 "'in <string>' requires string as left operand, not %.100s",
11125 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011126 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011127 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011128 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011129 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011130 if (ensure_unicode(str) < 0)
11131 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011132
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011133 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011134 kind2 = PyUnicode_KIND(substr);
11135 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011136 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011137 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011138 len2 = PyUnicode_GET_LENGTH(substr);
11139 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011140 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011141 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011142 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011143 if (len2 == 1) {
11144 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11145 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011146 return result;
11147 }
11148 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011149 buf2 = _PyUnicode_AsKind(substr, kind1);
11150 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011151 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011152 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153
Victor Stinner77282cb2013-04-14 19:22:47 +020011154 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 case PyUnicode_1BYTE_KIND:
11156 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11157 break;
11158 case PyUnicode_2BYTE_KIND:
11159 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11160 break;
11161 case PyUnicode_4BYTE_KIND:
11162 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11163 break;
11164 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011165 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011166 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011167
Victor Stinner77282cb2013-04-14 19:22:47 +020011168 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 PyMem_Free(buf2);
11170
Guido van Rossum403d68b2000-03-13 15:55:09 +000011171 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011172}
11173
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174/* Concat to string or Unicode object giving a new Unicode object. */
11175
Alexander Belopolsky40018472011-02-26 01:02:56 +000011176PyObject *
11177PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011179 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011180 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011181 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011183 if (ensure_unicode(left) < 0)
11184 return NULL;
11185
11186 if (!PyUnicode_Check(right)) {
11187 PyErr_Format(PyExc_TypeError,
11188 "can only concatenate str (not \"%.200s\") to str",
11189 right->ob_type->tp_name);
11190 return NULL;
11191 }
11192 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011193 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194
11195 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011196 if (left == unicode_empty)
11197 return PyUnicode_FromObject(right);
11198 if (right == unicode_empty)
11199 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011201 left_len = PyUnicode_GET_LENGTH(left);
11202 right_len = PyUnicode_GET_LENGTH(right);
11203 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011204 PyErr_SetString(PyExc_OverflowError,
11205 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011206 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011207 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011208 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011209
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011210 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11211 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011212 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011213
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011215 result = PyUnicode_New(new_len, maxchar);
11216 if (result == NULL)
11217 return NULL;
11218 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11219 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11220 assert(_PyUnicode_CheckConsistency(result, 1));
11221 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222}
11223
Walter Dörwald1ab83302007-05-18 17:15:44 +000011224void
Victor Stinner23e56682011-10-03 03:54:37 +020011225PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011226{
Victor Stinner23e56682011-10-03 03:54:37 +020011227 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011228 Py_UCS4 maxchar, maxchar2;
11229 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011230
11231 if (p_left == NULL) {
11232 if (!PyErr_Occurred())
11233 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011234 return;
11235 }
Victor Stinner23e56682011-10-03 03:54:37 +020011236 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011237 if (right == NULL || left == NULL
11238 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011239 if (!PyErr_Occurred())
11240 PyErr_BadInternalCall();
11241 goto error;
11242 }
11243
Benjamin Petersonbac79492012-01-14 13:34:47 -050011244 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011245 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011246 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011247 goto error;
11248
Victor Stinner488fa492011-12-12 00:01:39 +010011249 /* Shortcuts */
11250 if (left == unicode_empty) {
11251 Py_DECREF(left);
11252 Py_INCREF(right);
11253 *p_left = right;
11254 return;
11255 }
11256 if (right == unicode_empty)
11257 return;
11258
11259 left_len = PyUnicode_GET_LENGTH(left);
11260 right_len = PyUnicode_GET_LENGTH(right);
11261 if (left_len > PY_SSIZE_T_MAX - right_len) {
11262 PyErr_SetString(PyExc_OverflowError,
11263 "strings are too large to concat");
11264 goto error;
11265 }
11266 new_len = left_len + right_len;
11267
11268 if (unicode_modifiable(left)
11269 && PyUnicode_CheckExact(right)
11270 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011271 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11272 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011273 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011274 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011275 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11276 {
11277 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011278 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011279 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011280
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011281 /* copy 'right' into the newly allocated area of 'left' */
11282 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011283 }
Victor Stinner488fa492011-12-12 00:01:39 +010011284 else {
11285 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11286 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011287 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011288
Victor Stinner488fa492011-12-12 00:01:39 +010011289 /* Concat the two Unicode strings */
11290 res = PyUnicode_New(new_len, maxchar);
11291 if (res == NULL)
11292 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011293 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11294 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011295 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011296 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011297 }
11298 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011299 return;
11300
11301error:
Victor Stinner488fa492011-12-12 00:01:39 +010011302 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011303}
11304
11305void
11306PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11307{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011308 PyUnicode_Append(pleft, right);
11309 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011310}
11311
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011312/*
11313Wraps stringlib_parse_args_finds() and additionally ensures that the
11314first argument is a unicode object.
11315*/
11316
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011317static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011318parse_args_finds_unicode(const char * function_name, PyObject *args,
11319 PyObject **substring,
11320 Py_ssize_t *start, Py_ssize_t *end)
11321{
11322 if(stringlib_parse_args_finds(function_name, args, substring,
11323 start, end)) {
11324 if (ensure_unicode(*substring) < 0)
11325 return 0;
11326 return 1;
11327 }
11328 return 0;
11329}
11330
/* Help text describing S.count(); defined under the name count__doc__. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011331PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011332 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011334Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011335string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011336interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337
11338static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011339unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011341 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011342 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011343 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011345 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011346 void *buf1, *buf2;
11347 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011349 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011350 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011352 kind1 = PyUnicode_KIND(self);
11353 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011354 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011355 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011357 len1 = PyUnicode_GET_LENGTH(self);
11358 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011360 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011361 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011362
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011363 buf1 = PyUnicode_DATA(self);
11364 buf2 = PyUnicode_DATA(substring);
11365 if (kind2 != kind1) {
11366 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011367 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011368 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011369 }
11370 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011371 case PyUnicode_1BYTE_KIND:
11372 iresult = ucs1lib_count(
11373 ((Py_UCS1*)buf1) + start, end - start,
11374 buf2, len2, PY_SSIZE_T_MAX
11375 );
11376 break;
11377 case PyUnicode_2BYTE_KIND:
11378 iresult = ucs2lib_count(
11379 ((Py_UCS2*)buf1) + start, end - start,
11380 buf2, len2, PY_SSIZE_T_MAX
11381 );
11382 break;
11383 case PyUnicode_4BYTE_KIND:
11384 iresult = ucs4lib_count(
11385 ((Py_UCS4*)buf1) + start, end - start,
11386 buf2, len2, PY_SSIZE_T_MAX
11387 );
11388 break;
11389 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011390 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011391 }
11392
11393 result = PyLong_FromSsize_t(iresult);
11394
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011395 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398 return result;
11399}
11400
INADA Naoki3ae20562017-01-16 20:41:20 +090011401/*[clinic input]
11402str.encode as unicode_encode
11403
11404 encoding: str(c_default="NULL") = 'utf-8'
11405 The encoding in which to encode the string.
11406 errors: str(c_default="NULL") = 'strict'
11407 The error handling scheme to use for encoding errors.
11408 The default is 'strict' meaning that encoding errors raise a
11409 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11410 'xmlcharrefreplace' as well as any other name registered with
11411 codecs.register_error that can handle UnicodeEncodeErrors.
11412
11413Encode the string using the codec registered for encoding.
11414[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415
11416static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011417unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011418/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011420 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011421}
11422
INADA Naoki3ae20562017-01-16 20:41:20 +090011423/*[clinic input]
11424str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
INADA Naoki3ae20562017-01-16 20:41:20 +090011426 tabsize: int = 8
11427
11428Return a copy where all tab characters are expanded using spaces.
11429
11430If tabsize is not given, a tab size of 8 characters is assumed.
11431[clinic start generated code]*/
11432
11433static PyObject *
11434unicode_expandtabs_impl(PyObject *self, int tabsize)
11435/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011437 Py_ssize_t i, j, line_pos, src_len, incr;
11438 Py_UCS4 ch;
11439 PyObject *u;
11440 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011441 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011442 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
Antoine Pitrou22425222011-10-04 19:10:51 +020011444 if (PyUnicode_READY(self) == -1)
11445 return NULL;
11446
Thomas Wouters7e474022000-07-16 12:04:32 +000011447 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011448 src_len = PyUnicode_GET_LENGTH(self);
11449 i = j = line_pos = 0;
11450 kind = PyUnicode_KIND(self);
11451 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011452 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011453 for (; i < src_len; i++) {
11454 ch = PyUnicode_READ(kind, src_data, i);
11455 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011456 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011457 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011458 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011459 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011460 goto overflow;
11461 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011462 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011463 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011464 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011466 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011467 goto overflow;
11468 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011470 if (ch == '\n' || ch == '\r')
11471 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011473 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011474 if (!found)
11475 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011476
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011478 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479 if (!u)
11480 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011481 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482
Antoine Pitroue71d5742011-10-04 15:55:09 +020011483 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484
Antoine Pitroue71d5742011-10-04 15:55:09 +020011485 for (; i < src_len; i++) {
11486 ch = PyUnicode_READ(kind, src_data, i);
11487 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011488 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011489 incr = tabsize - (line_pos % tabsize);
11490 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011491 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011492 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011493 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011494 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011495 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011496 line_pos++;
11497 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011498 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011499 if (ch == '\n' || ch == '\r')
11500 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011502 }
11503 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011504 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011505
Antoine Pitroue71d5742011-10-04 15:55:09 +020011506 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011507 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11508 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509}
11510
/* Help text describing S.find(); defined under the name find__doc__. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011511PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011512 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513\n\
11514Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011515such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516arguments start and end are interpreted as in slice notation.\n\
11517\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011518Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519
11520static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011523 /* initialize variables to prevent gcc warning */
11524 PyObject *substring = NULL;
11525 Py_ssize_t start = 0;
11526 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011527 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011529 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011532 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011534
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011535 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537 if (result == -2)
11538 return NULL;
11539
Christian Heimes217cfd12007-12-02 14:31:20 +000011540 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541}
11542
11543static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011544unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011546 void *data;
11547 enum PyUnicode_Kind kind;
11548 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011549
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011550 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011551 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011553 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011554 if (PyUnicode_READY(self) == -1) {
11555 return NULL;
11556 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011557 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11558 PyErr_SetString(PyExc_IndexError, "string index out of range");
11559 return NULL;
11560 }
11561 kind = PyUnicode_KIND(self);
11562 data = PyUnicode_DATA(self);
11563 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011564 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565}
11566
Guido van Rossumc2504932007-09-18 19:42:40 +000011567/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011568 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011569static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011570unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011572 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011573
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011574#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011575 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011576#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 if (_PyUnicode_HASH(self) != -1)
11578 return _PyUnicode_HASH(self);
11579 if (PyUnicode_READY(self) == -1)
11580 return -1;
animalizea1d14252019-01-02 20:16:06 +080011581
Christian Heimes985ecdc2013-11-20 11:46:18 +010011582 x = _Py_HashBytes(PyUnicode_DATA(self),
11583 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011585 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586}
11587
/* Help text describing S.index(); defined under the name index__doc__. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011588PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011589 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590\n\
oldkaa0735f2018-02-02 16:52:55 +080011591Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011592such that sub is contained within S[start:end]. Optional\n\
11593arguments start and end are interpreted as in slice notation.\n\
11594\n\
11595Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596
11597static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011598unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011600 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011601 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011602 PyObject *substring = NULL;
11603 Py_ssize_t start = 0;
11604 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011606 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011609 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011610 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011611
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011612 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 if (result == -2)
11615 return NULL;
11616
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617 if (result < 0) {
11618 PyErr_SetString(PyExc_ValueError, "substring not found");
11619 return NULL;
11620 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011621
Christian Heimes217cfd12007-12-02 14:31:20 +000011622 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623}
11624
INADA Naoki3ae20562017-01-16 20:41:20 +090011625/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011626str.isascii as unicode_isascii
11627
11628Return True if all characters in the string are ASCII, False otherwise.
11629
11630ASCII characters have code points in the range U+0000-U+007F.
11631Empty string is ASCII too.
11632[clinic start generated code]*/
11633
11634static PyObject *
11635unicode_isascii_impl(PyObject *self)
11636/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11637{
11638 if (PyUnicode_READY(self) == -1) {
11639 return NULL;
11640 }
11641 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11642}
11643
11644/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011645str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646
INADA Naoki3ae20562017-01-16 20:41:20 +090011647Return True if the string is a lowercase string, False otherwise.
11648
11649A string is lowercase if all cased characters in the string are lowercase and
11650there is at least one cased character in the string.
11651[clinic start generated code]*/
11652
11653static PyObject *
11654unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011655/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 Py_ssize_t i, length;
11658 int kind;
11659 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660 int cased;
11661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 if (PyUnicode_READY(self) == -1)
11663 return NULL;
11664 length = PyUnicode_GET_LENGTH(self);
11665 kind = PyUnicode_KIND(self);
11666 data = PyUnicode_DATA(self);
11667
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011669 if (length == 1)
11670 return PyBool_FromLong(
11671 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011673 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011675 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011676
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 for (i = 0; i < length; i++) {
11679 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011680
Benjamin Peterson29060642009-01-31 22:14:21 +000011681 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011682 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011683 else if (!cased && Py_UNICODE_ISLOWER(ch))
11684 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011686 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687}
11688
INADA Naoki3ae20562017-01-16 20:41:20 +090011689/*[clinic input]
11690str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691
INADA Naoki3ae20562017-01-16 20:41:20 +090011692Return True if the string is an uppercase string, False otherwise.
11693
11694A string is uppercase if all cased characters in the string are uppercase and
11695there is at least one cased character in the string.
11696[clinic start generated code]*/
11697
11698static PyObject *
11699unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011700/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011702 Py_ssize_t i, length;
11703 int kind;
11704 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705 int cased;
11706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707 if (PyUnicode_READY(self) == -1)
11708 return NULL;
11709 length = PyUnicode_GET_LENGTH(self);
11710 kind = PyUnicode_KIND(self);
11711 data = PyUnicode_DATA(self);
11712
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 if (length == 1)
11715 return PyBool_FromLong(
11716 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011718 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011720 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011721
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 for (i = 0; i < length; i++) {
11724 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011725
Benjamin Peterson29060642009-01-31 22:14:21 +000011726 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011727 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011728 else if (!cased && Py_UNICODE_ISUPPER(ch))
11729 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011731 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732}
11733
INADA Naoki3ae20562017-01-16 20:41:20 +090011734/*[clinic input]
11735str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736
INADA Naoki3ae20562017-01-16 20:41:20 +090011737Return True if the string is a title-cased string, False otherwise.
11738
11739In a title-cased string, upper- and title-case characters may only
11740follow uncased characters and lowercase characters only cased ones.
11741[clinic start generated code]*/
11742
11743static PyObject *
11744unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011745/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 Py_ssize_t i, length;
11748 int kind;
11749 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750 int cased, previous_is_cased;
11751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 if (PyUnicode_READY(self) == -1)
11753 return NULL;
11754 length = PyUnicode_GET_LENGTH(self);
11755 kind = PyUnicode_KIND(self);
11756 data = PyUnicode_DATA(self);
11757
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011759 if (length == 1) {
11760 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11761 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11762 (Py_UNICODE_ISUPPER(ch) != 0));
11763 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011765 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011767 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011768
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769 cased = 0;
11770 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 for (i = 0; i < length; i++) {
11772 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011773
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11775 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011776 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011777 previous_is_cased = 1;
11778 cased = 1;
11779 }
11780 else if (Py_UNICODE_ISLOWER(ch)) {
11781 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011782 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011783 previous_is_cased = 1;
11784 cased = 1;
11785 }
11786 else
11787 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011789 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790}
11791
INADA Naoki3ae20562017-01-16 20:41:20 +090011792/*[clinic input]
11793str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794
INADA Naoki3ae20562017-01-16 20:41:20 +090011795Return True if the string is a whitespace string, False otherwise.
11796
11797A string is whitespace if all characters in the string are whitespace and there
11798is at least one character in the string.
11799[clinic start generated code]*/
11800
11801static PyObject *
11802unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011803/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 Py_ssize_t i, length;
11806 int kind;
11807 void *data;
11808
11809 if (PyUnicode_READY(self) == -1)
11810 return NULL;
11811 length = PyUnicode_GET_LENGTH(self);
11812 kind = PyUnicode_KIND(self);
11813 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 if (length == 1)
11817 return PyBool_FromLong(
11818 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011820 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011822 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 for (i = 0; i < length; i++) {
11825 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011826 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011827 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011829 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830}
11831
INADA Naoki3ae20562017-01-16 20:41:20 +090011832/*[clinic input]
11833str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011834
INADA Naoki3ae20562017-01-16 20:41:20 +090011835Return True if the string is an alphabetic string, False otherwise.
11836
11837A string is alphabetic if all characters in the string are alphabetic and there
11838is at least one character in the string.
11839[clinic start generated code]*/
11840
11841static PyObject *
11842unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011843/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011844{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 Py_ssize_t i, length;
11846 int kind;
11847 void *data;
11848
11849 if (PyUnicode_READY(self) == -1)
11850 return NULL;
11851 length = PyUnicode_GET_LENGTH(self);
11852 kind = PyUnicode_KIND(self);
11853 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011854
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011855 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 if (length == 1)
11857 return PyBool_FromLong(
11858 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011859
11860 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011862 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011864 for (i = 0; i < length; i++) {
11865 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011866 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011867 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011868 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011869}
11870
INADA Naoki3ae20562017-01-16 20:41:20 +090011871/*[clinic input]
11872str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011873
INADA Naoki3ae20562017-01-16 20:41:20 +090011874Return True if the string is an alpha-numeric string, False otherwise.
11875
11876A string is alpha-numeric if all characters in the string are alpha-numeric and
11877there is at least one character in the string.
11878[clinic start generated code]*/
11879
11880static PyObject *
11881unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011882/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011883{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011884 int kind;
11885 void *data;
11886 Py_ssize_t len, i;
11887
11888 if (PyUnicode_READY(self) == -1)
11889 return NULL;
11890
11891 kind = PyUnicode_KIND(self);
11892 data = PyUnicode_DATA(self);
11893 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011894
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011895 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 if (len == 1) {
11897 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11898 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11899 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011900
11901 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011903 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 for (i = 0; i < len; i++) {
11906 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011907 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011908 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011909 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011910 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011911}
11912
INADA Naoki3ae20562017-01-16 20:41:20 +090011913/*[clinic input]
11914str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915
INADA Naoki3ae20562017-01-16 20:41:20 +090011916Return True if the string is a decimal string, False otherwise.
11917
11918A string is a decimal string if all characters in the string are decimal and
11919there is at least one character in the string.
11920[clinic start generated code]*/
11921
11922static PyObject *
11923unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011924/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 Py_ssize_t i, length;
11927 int kind;
11928 void *data;
11929
11930 if (PyUnicode_READY(self) == -1)
11931 return NULL;
11932 length = PyUnicode_GET_LENGTH(self);
11933 kind = PyUnicode_KIND(self);
11934 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 if (length == 1)
11938 return PyBool_FromLong(
11939 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011941 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011943 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 for (i = 0; i < length; i++) {
11946 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011947 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011949 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950}
11951
INADA Naoki3ae20562017-01-16 20:41:20 +090011952/*[clinic input]
11953str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954
INADA Naoki3ae20562017-01-16 20:41:20 +090011955Return True if the string is a digit string, False otherwise.
11956
11957A string is a digit string if all characters in the string are digits and there
11958is at least one character in the string.
11959[clinic start generated code]*/
11960
11961static PyObject *
11962unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011963/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 Py_ssize_t i, length;
11966 int kind;
11967 void *data;
11968
11969 if (PyUnicode_READY(self) == -1)
11970 return NULL;
11971 length = PyUnicode_GET_LENGTH(self);
11972 kind = PyUnicode_KIND(self);
11973 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 if (length == 1) {
11977 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11978 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11979 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011980
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011981 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011983 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 for (i = 0; i < length; i++) {
11986 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011987 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011989 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990}
11991
INADA Naoki3ae20562017-01-16 20:41:20 +090011992/*[clinic input]
11993str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994
INADA Naoki3ae20562017-01-16 20:41:20 +090011995Return True if the string is a numeric string, False otherwise.
11996
11997A string is numeric if all characters in the string are numeric and there is at
11998least one character in the string.
11999[clinic start generated code]*/
12000
12001static PyObject *
12002unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012003/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 Py_ssize_t i, length;
12006 int kind;
12007 void *data;
12008
12009 if (PyUnicode_READY(self) == -1)
12010 return NULL;
12011 length = PyUnicode_GET_LENGTH(self);
12012 kind = PyUnicode_KIND(self);
12013 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 if (length == 1)
12017 return PyBool_FromLong(
12018 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012020 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012022 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 for (i = 0; i < length; i++) {
12025 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012026 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012028 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029}
12030
Martin v. Löwis47383402007-08-15 07:32:56 +000012031int
12032PyUnicode_IsIdentifier(PyObject *self)
12033{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 int kind;
12035 void *data;
12036 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012037 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 if (PyUnicode_READY(self) == -1) {
12040 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012041 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 }
12043
12044 /* Special case for empty strings */
12045 if (PyUnicode_GET_LENGTH(self) == 0)
12046 return 0;
12047 kind = PyUnicode_KIND(self);
12048 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012049
12050 /* PEP 3131 says that the first character must be in
12051 XID_Start and subsequent characters in XID_Continue,
12052 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012053 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012054 letters, digits, underscore). However, given the current
12055 definition of XID_Start and XID_Continue, it is sufficient
12056 to check just for these, except that _ must be allowed
12057 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012059 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012060 return 0;
12061
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012062 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012064 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012065 return 1;
12066}
12067
INADA Naoki3ae20562017-01-16 20:41:20 +090012068/*[clinic input]
12069str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012070
INADA Naoki3ae20562017-01-16 20:41:20 +090012071Return True if the string is a valid Python identifier, False otherwise.
12072
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012073Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012074such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012075[clinic start generated code]*/
12076
12077static PyObject *
12078unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012079/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012080{
12081 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12082}
12083
INADA Naoki3ae20562017-01-16 20:41:20 +090012084/*[clinic input]
12085str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012086
INADA Naoki3ae20562017-01-16 20:41:20 +090012087Return True if the string is printable, False otherwise.
12088
12089A string is printable if all of its characters are considered printable in
12090repr() or if it is empty.
12091[clinic start generated code]*/
12092
12093static PyObject *
12094unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012095/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012096{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097 Py_ssize_t i, length;
12098 int kind;
12099 void *data;
12100
12101 if (PyUnicode_READY(self) == -1)
12102 return NULL;
12103 length = PyUnicode_GET_LENGTH(self);
12104 kind = PyUnicode_KIND(self);
12105 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012106
12107 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 if (length == 1)
12109 return PyBool_FromLong(
12110 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 for (i = 0; i < length; i++) {
12113 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012114 Py_RETURN_FALSE;
12115 }
12116 }
12117 Py_RETURN_TRUE;
12118}
12119
INADA Naoki3ae20562017-01-16 20:41:20 +090012120/*[clinic input]
12121str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122
INADA Naoki3ae20562017-01-16 20:41:20 +090012123 iterable: object
12124 /
12125
12126Concatenate any number of strings.
12127
Martin Panter91a88662017-01-24 00:30:06 +000012128The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012129The result is returned as a new string.
12130
12131Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12132[clinic start generated code]*/
12133
12134static PyObject *
12135unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012136/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137{
INADA Naoki3ae20562017-01-16 20:41:20 +090012138 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139}
12140
Martin v. Löwis18e16552006-02-15 17:27:45 +000012141static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012142unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 if (PyUnicode_READY(self) == -1)
12145 return -1;
12146 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147}
12148
INADA Naoki3ae20562017-01-16 20:41:20 +090012149/*[clinic input]
12150str.ljust as unicode_ljust
12151
12152 width: Py_ssize_t
12153 fillchar: Py_UCS4 = ' '
12154 /
12155
12156Return a left-justified string of length width.
12157
12158Padding is done using the specified fill character (default is a space).
12159[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160
12161static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012162unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12163/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012164{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012165 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012166 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167
Victor Stinnerc4b49542011-12-11 22:44:26 +010012168 if (PyUnicode_GET_LENGTH(self) >= width)
12169 return unicode_result_unchanged(self);
12170
12171 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172}
12173
INADA Naoki3ae20562017-01-16 20:41:20 +090012174/*[clinic input]
12175str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176
INADA Naoki3ae20562017-01-16 20:41:20 +090012177Return a copy of the string converted to lowercase.
12178[clinic start generated code]*/
12179
12180static PyObject *
12181unicode_lower_impl(PyObject *self)
12182/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012184 if (PyUnicode_READY(self) == -1)
12185 return NULL;
12186 if (PyUnicode_IS_ASCII(self))
12187 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012188 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189}
12190
/* Strip modes accepted by the strip helpers; the values double as indexes
   into stripfuncnames[] below, so the two must stay in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip mode above.  Both the strings and the
   array slots are const: the table is never modified at run time. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Method name for strip mode `i`, used when formatting error messages. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012199
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012200/* externally visible for str.strip(unicode) */
12201PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012202_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012203{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204 void *data;
12205 int kind;
12206 Py_ssize_t i, j, len;
12207 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012208 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012209
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12211 return NULL;
12212
12213 kind = PyUnicode_KIND(self);
12214 data = PyUnicode_DATA(self);
12215 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012216 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12218 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012219 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012220
Benjamin Peterson14339b62009-01-31 16:36:08 +000012221 i = 0;
12222 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012223 while (i < len) {
12224 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12225 if (!BLOOM(sepmask, ch))
12226 break;
12227 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12228 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012229 i++;
12230 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012231 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012232
Benjamin Peterson14339b62009-01-31 16:36:08 +000012233 j = len;
12234 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012235 j--;
12236 while (j >= i) {
12237 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12238 if (!BLOOM(sepmask, ch))
12239 break;
12240 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12241 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012242 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012243 }
12244
Benjamin Peterson29060642009-01-31 22:14:21 +000012245 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012246 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012247
Victor Stinner7931d9a2011-11-04 00:22:48 +010012248 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249}
12250
12251PyObject*
12252PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12253{
12254 unsigned char *data;
12255 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012256 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257
Victor Stinnerde636f32011-10-01 03:55:54 +020012258 if (PyUnicode_READY(self) == -1)
12259 return NULL;
12260
Victor Stinner684d5fd2012-05-03 02:32:34 +020012261 length = PyUnicode_GET_LENGTH(self);
12262 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012263
Victor Stinner684d5fd2012-05-03 02:32:34 +020012264 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012265 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012266
Victor Stinnerde636f32011-10-01 03:55:54 +020012267 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012268 PyErr_SetString(PyExc_IndexError, "string index out of range");
12269 return NULL;
12270 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012271 if (start >= length || end < start)
12272 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012273
Victor Stinner684d5fd2012-05-03 02:32:34 +020012274 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012275 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012276 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012277 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012278 }
12279 else {
12280 kind = PyUnicode_KIND(self);
12281 data = PyUnicode_1BYTE_DATA(self);
12282 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012283 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012284 length);
12285 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287
12288static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012289do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 Py_ssize_t len, i, j;
12292
12293 if (PyUnicode_READY(self) == -1)
12294 return NULL;
12295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012296 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012297
Victor Stinnercc7af722013-04-09 22:39:24 +020012298 if (PyUnicode_IS_ASCII(self)) {
12299 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12300
12301 i = 0;
12302 if (striptype != RIGHTSTRIP) {
12303 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012304 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012305 if (!_Py_ascii_whitespace[ch])
12306 break;
12307 i++;
12308 }
12309 }
12310
12311 j = len;
12312 if (striptype != LEFTSTRIP) {
12313 j--;
12314 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012315 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012316 if (!_Py_ascii_whitespace[ch])
12317 break;
12318 j--;
12319 }
12320 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012321 }
12322 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012323 else {
12324 int kind = PyUnicode_KIND(self);
12325 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012326
Victor Stinnercc7af722013-04-09 22:39:24 +020012327 i = 0;
12328 if (striptype != RIGHTSTRIP) {
12329 while (i < len) {
12330 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12331 if (!Py_UNICODE_ISSPACE(ch))
12332 break;
12333 i++;
12334 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012335 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012336
12337 j = len;
12338 if (striptype != LEFTSTRIP) {
12339 j--;
12340 while (j >= i) {
12341 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12342 if (!Py_UNICODE_ISSPACE(ch))
12343 break;
12344 j--;
12345 }
12346 j++;
12347 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012348 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012349
Victor Stinner7931d9a2011-11-04 00:22:48 +010012350 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012351}
12352
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012353
12354static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012355do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012356{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012357 if (sep != NULL && sep != Py_None) {
12358 if (PyUnicode_Check(sep))
12359 return _PyUnicode_XStrip(self, striptype, sep);
12360 else {
12361 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012362 "%s arg must be None or str",
12363 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012364 return NULL;
12365 }
12366 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012367
Benjamin Peterson14339b62009-01-31 16:36:08 +000012368 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012369}
12370
12371
INADA Naoki3ae20562017-01-16 20:41:20 +090012372/*[clinic input]
12373str.strip as unicode_strip
12374
12375 chars: object = None
12376 /
12377
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012379
12380If chars is given and not None, remove characters in chars instead.
12381[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012382
12383static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012384unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012385/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012386{
INADA Naoki3ae20562017-01-16 20:41:20 +090012387 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012388}
12389
12390
INADA Naoki3ae20562017-01-16 20:41:20 +090012391/*[clinic input]
12392str.lstrip as unicode_lstrip
12393
12394 chars: object = NULL
12395 /
12396
12397Return a copy of the string with leading whitespace removed.
12398
12399If chars is given and not None, remove characters in chars instead.
12400[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012401
12402static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012403unicode_lstrip_impl(PyObject *self, PyObject *chars)
12404/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012405{
INADA Naoki3ae20562017-01-16 20:41:20 +090012406 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012407}
12408
12409
INADA Naoki3ae20562017-01-16 20:41:20 +090012410/*[clinic input]
12411str.rstrip as unicode_rstrip
12412
12413 chars: object = NULL
12414 /
12415
12416Return a copy of the string with trailing whitespace removed.
12417
12418If chars is given and not None, remove characters in chars instead.
12419[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012420
12421static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012422unicode_rstrip_impl(PyObject *self, PyObject *chars)
12423/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012424{
INADA Naoki3ae20562017-01-16 20:41:20 +090012425 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012426}
12427
12428
Guido van Rossumd57fd912000-03-10 22:53:23 +000012429static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012430unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012431{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012432 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012434
Serhiy Storchaka05997252013-01-26 12:14:02 +020012435 if (len < 1)
12436 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012437
Victor Stinnerc4b49542011-12-11 22:44:26 +010012438 /* no repeat, return original string */
12439 if (len == 1)
12440 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012441
Benjamin Petersonbac79492012-01-14 13:34:47 -050012442 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 return NULL;
12444
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012445 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012446 PyErr_SetString(PyExc_OverflowError,
12447 "repeated string is too long");
12448 return NULL;
12449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012450 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012451
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012452 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012453 if (!u)
12454 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012455 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457 if (PyUnicode_GET_LENGTH(str) == 1) {
12458 const int kind = PyUnicode_KIND(str);
12459 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012460 if (kind == PyUnicode_1BYTE_KIND) {
12461 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012462 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012463 }
12464 else if (kind == PyUnicode_2BYTE_KIND) {
12465 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012466 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012467 ucs2[n] = fill_char;
12468 } else {
12469 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12470 assert(kind == PyUnicode_4BYTE_KIND);
12471 for (n = 0; n < len; ++n)
12472 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012473 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012474 }
12475 else {
12476 /* number of characters copied this far */
12477 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012478 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012480 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012481 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012482 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012484 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012485 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012486 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012487 }
12488
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012489 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012490 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012491}
12492
Alexander Belopolsky40018472011-02-26 01:02:56 +000012493PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012494PyUnicode_Replace(PyObject *str,
12495 PyObject *substr,
12496 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012497 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012499 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12500 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012501 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012502 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503}
12504
INADA Naoki3ae20562017-01-16 20:41:20 +090012505/*[clinic input]
12506str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507
INADA Naoki3ae20562017-01-16 20:41:20 +090012508 old: unicode
12509 new: unicode
12510 count: Py_ssize_t = -1
12511 Maximum number of occurrences to replace.
12512 -1 (the default value) means replace all occurrences.
12513 /
12514
12515Return a copy with all occurrences of substring old replaced by new.
12516
12517If the optional argument count is given, only the first count occurrences are
12518replaced.
12519[clinic start generated code]*/
12520
12521static PyObject *
12522unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12523 Py_ssize_t count)
12524/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012526 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012527 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012528 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529}
12530
Alexander Belopolsky40018472011-02-26 01:02:56 +000012531static PyObject *
12532unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012534 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 Py_ssize_t isize;
12536 Py_ssize_t osize, squote, dquote, i, o;
12537 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012538 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012542 return NULL;
12543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 isize = PyUnicode_GET_LENGTH(unicode);
12545 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547 /* Compute length of output, quote characters, and
12548 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012549 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 max = 127;
12551 squote = dquote = 0;
12552 ikind = PyUnicode_KIND(unicode);
12553 for (i = 0; i < isize; i++) {
12554 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012555 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012557 case '\'': squote++; break;
12558 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012559 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012560 incr = 2;
12561 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012562 default:
12563 /* Fast-path ASCII */
12564 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012565 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012566 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012567 ;
12568 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012570 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012571 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012572 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012573 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012575 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012577 if (osize > PY_SSIZE_T_MAX - incr) {
12578 PyErr_SetString(PyExc_OverflowError,
12579 "string is too long to generate repr");
12580 return NULL;
12581 }
12582 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012583 }
12584
12585 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012586 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012588 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012589 if (dquote)
12590 /* Both squote and dquote present. Use squote,
12591 and escape them */
12592 osize += squote;
12593 else
12594 quote = '"';
12595 }
Victor Stinner55c08782013-04-14 18:45:39 +020012596 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012597
12598 repr = PyUnicode_New(osize, max);
12599 if (repr == NULL)
12600 return NULL;
12601 okind = PyUnicode_KIND(repr);
12602 odata = PyUnicode_DATA(repr);
12603
12604 PyUnicode_WRITE(okind, odata, 0, quote);
12605 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012606 if (unchanged) {
12607 _PyUnicode_FastCopyCharacters(repr, 1,
12608 unicode, 0,
12609 isize);
12610 }
12611 else {
12612 for (i = 0, o = 1; i < isize; i++) {
12613 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012614
Victor Stinner55c08782013-04-14 18:45:39 +020012615 /* Escape quotes and backslashes */
12616 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012617 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012619 continue;
12620 }
12621
12622 /* Map special whitespace to '\t', \n', '\r' */
12623 if (ch == '\t') {
12624 PyUnicode_WRITE(okind, odata, o++, '\\');
12625 PyUnicode_WRITE(okind, odata, o++, 't');
12626 }
12627 else if (ch == '\n') {
12628 PyUnicode_WRITE(okind, odata, o++, '\\');
12629 PyUnicode_WRITE(okind, odata, o++, 'n');
12630 }
12631 else if (ch == '\r') {
12632 PyUnicode_WRITE(okind, odata, o++, '\\');
12633 PyUnicode_WRITE(okind, odata, o++, 'r');
12634 }
12635
12636 /* Map non-printable US ASCII to '\xhh' */
12637 else if (ch < ' ' || ch == 0x7F) {
12638 PyUnicode_WRITE(okind, odata, o++, '\\');
12639 PyUnicode_WRITE(okind, odata, o++, 'x');
12640 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12641 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12642 }
12643
12644 /* Copy ASCII characters as-is */
12645 else if (ch < 0x7F) {
12646 PyUnicode_WRITE(okind, odata, o++, ch);
12647 }
12648
12649 /* Non-ASCII characters */
12650 else {
12651 /* Map Unicode whitespace and control characters
12652 (categories Z* and C* except ASCII space)
12653 */
12654 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12655 PyUnicode_WRITE(okind, odata, o++, '\\');
12656 /* Map 8-bit characters to '\xhh' */
12657 if (ch <= 0xff) {
12658 PyUnicode_WRITE(okind, odata, o++, 'x');
12659 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12660 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12661 }
12662 /* Map 16-bit characters to '\uxxxx' */
12663 else if (ch <= 0xffff) {
12664 PyUnicode_WRITE(okind, odata, o++, 'u');
12665 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12666 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12667 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12668 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12669 }
12670 /* Map 21-bit characters to '\U00xxxxxx' */
12671 else {
12672 PyUnicode_WRITE(okind, odata, o++, 'U');
12673 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12674 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12675 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12676 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12677 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12678 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12679 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12680 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12681 }
12682 }
12683 /* Copy characters as-is */
12684 else {
12685 PyUnicode_WRITE(okind, odata, o++, ch);
12686 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012687 }
12688 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012689 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012691 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012692 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693}
12694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012695PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012696 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697\n\
12698Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012699such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700arguments start and end are interpreted as in slice notation.\n\
12701\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012702Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703
12704static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012707 /* initialize variables to prevent gcc warning */
12708 PyObject *substring = NULL;
12709 Py_ssize_t start = 0;
12710 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012711 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012712
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012713 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012714 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012715
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012716 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012719 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 if (result == -2)
12722 return NULL;
12723
Christian Heimes217cfd12007-12-02 14:31:20 +000012724 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012725}
12726
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012727PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012730Return the highest index in S where substring sub is found,\n\
12731such that sub is contained within S[start:end]. Optional\n\
12732arguments start and end are interpreted as in slice notation.\n\
12733\n\
12734Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012735
12736static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012738{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012739 /* initialize variables to prevent gcc warning */
12740 PyObject *substring = NULL;
12741 Py_ssize_t start = 0;
12742 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012743 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012744
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012745 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012748 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012749 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012751 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753 if (result == -2)
12754 return NULL;
12755
Guido van Rossumd57fd912000-03-10 22:53:23 +000012756 if (result < 0) {
12757 PyErr_SetString(PyExc_ValueError, "substring not found");
12758 return NULL;
12759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012760
Christian Heimes217cfd12007-12-02 14:31:20 +000012761 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762}
12763
INADA Naoki3ae20562017-01-16 20:41:20 +090012764/*[clinic input]
12765str.rjust as unicode_rjust
12766
12767 width: Py_ssize_t
12768 fillchar: Py_UCS4 = ' '
12769 /
12770
12771Return a right-justified string of length width.
12772
12773Padding is done using the specified fill character (default is a space).
12774[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012775
12776static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012777unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12778/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012779{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012780 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012781 return NULL;
12782
Victor Stinnerc4b49542011-12-11 22:44:26 +010012783 if (PyUnicode_GET_LENGTH(self) >= width)
12784 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012785
Victor Stinnerc4b49542011-12-11 22:44:26 +010012786 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012787}
12788
Alexander Belopolsky40018472011-02-26 01:02:56 +000012789PyObject *
12790PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012791{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012792 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012793 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012795 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796}
12797
INADA Naoki3ae20562017-01-16 20:41:20 +090012798/*[clinic input]
12799str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800
INADA Naoki3ae20562017-01-16 20:41:20 +090012801 sep: object = None
        The delimiter according to which to split the string.
12803 None (the default value) means split according to any whitespace,
12804 and discard empty strings from the result.
12805 maxsplit: Py_ssize_t = -1
12806 Maximum number of splits to do.
12807 -1 (the default value) means no limit.
12808
12809Return a list of the words in the string, using sep as the delimiter string.
12810[clinic start generated code]*/
12811
12812static PyObject *
12813unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12814/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815{
INADA Naoki3ae20562017-01-16 20:41:20 +090012816 if (sep == Py_None)
12817 return split(self, NULL, maxsplit);
12818 if (PyUnicode_Check(sep))
12819 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012820
Victor Stinner998b8062018-09-12 00:23:25 +020012821 PyErr_Format(PyExc_TypeError,
12822 "must be str or None, not %.100s",
12823 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012825}
12826
Thomas Wouters477c8d52006-05-27 19:21:47 +000012827PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012828PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012829{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012830 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012831 int kind1, kind2;
12832 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012833 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012834
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012835 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012836 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012837
Victor Stinner14f8f022011-10-05 20:58:25 +020012838 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012839 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012840 len1 = PyUnicode_GET_LENGTH(str_obj);
12841 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012842 if (kind1 < kind2 || len1 < len2) {
12843 _Py_INCREF_UNICODE_EMPTY();
12844 if (!unicode_empty)
12845 out = NULL;
12846 else {
12847 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12848 Py_DECREF(unicode_empty);
12849 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012850 return out;
12851 }
12852 buf1 = PyUnicode_DATA(str_obj);
12853 buf2 = PyUnicode_DATA(sep_obj);
12854 if (kind2 != kind1) {
12855 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12856 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012857 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012858 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012859
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012860 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012861 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012862 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12863 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12864 else
12865 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012866 break;
12867 case PyUnicode_2BYTE_KIND:
12868 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12869 break;
12870 case PyUnicode_4BYTE_KIND:
12871 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12872 break;
12873 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012874 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012875 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012876
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012877 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012878 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012879
12880 return out;
12881}
12882
12883
12884PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012885PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012886{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012887 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012888 int kind1, kind2;
12889 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012890 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012891
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012892 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012893 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012894
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012895 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012896 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012897 len1 = PyUnicode_GET_LENGTH(str_obj);
12898 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012899 if (kind1 < kind2 || len1 < len2) {
12900 _Py_INCREF_UNICODE_EMPTY();
12901 if (!unicode_empty)
12902 out = NULL;
12903 else {
12904 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12905 Py_DECREF(unicode_empty);
12906 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012907 return out;
12908 }
12909 buf1 = PyUnicode_DATA(str_obj);
12910 buf2 = PyUnicode_DATA(sep_obj);
12911 if (kind2 != kind1) {
12912 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12913 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012914 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012915 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012916
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012917 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012918 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012919 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12920 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12921 else
12922 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012923 break;
12924 case PyUnicode_2BYTE_KIND:
12925 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12926 break;
12927 case PyUnicode_4BYTE_KIND:
12928 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12929 break;
12930 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012931 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012932 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012933
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012934 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012936
12937 return out;
12938}
12939
INADA Naoki3ae20562017-01-16 20:41:20 +090012940/*[clinic input]
12941str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012942
INADA Naoki3ae20562017-01-16 20:41:20 +090012943 sep: object
12944 /
12945
12946Partition the string into three parts using the given separator.
12947
12948This will search for the separator in the string. If the separator is found,
12949returns a 3-tuple containing the part before the separator, the separator
12950itself, and the part after it.
12951
12952If the separator is not found, returns a 3-tuple containing the original string
12953and two empty strings.
12954[clinic start generated code]*/
12955
12956static PyObject *
12957unicode_partition(PyObject *self, PyObject *sep)
12958/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000012959{
INADA Naoki3ae20562017-01-16 20:41:20 +090012960 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012961}
12962
INADA Naoki3ae20562017-01-16 20:41:20 +090012963/*[clinic input]
12964str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012965
INADA Naoki3ae20562017-01-16 20:41:20 +090012966Partition the string into three parts using the given separator.
12967
Serhiy Storchakaa2314282017-10-29 02:11:54 +030012968This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090012969the separator is found, returns a 3-tuple containing the part before the
12970separator, the separator itself, and the part after it.
12971
12972If the separator is not found, returns a 3-tuple containing two empty strings
12973and the original string.
12974[clinic start generated code]*/
12975
12976static PyObject *
12977unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030012978/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000012979{
INADA Naoki3ae20562017-01-16 20:41:20 +090012980 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012981}
12982
Alexander Belopolsky40018472011-02-26 01:02:56 +000012983PyObject *
12984PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012985{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012986 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012987 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012988
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012989 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012990}
12991
INADA Naoki3ae20562017-01-16 20:41:20 +090012992/*[clinic input]
12993str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012994
INADA Naoki3ae20562017-01-16 20:41:20 +090012995Return a list of the words in the string, using sep as the delimiter string.
12996
12997Splits are done starting at the end of the string and working to the front.
12998[clinic start generated code]*/
12999
13000static PyObject *
13001unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13002/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013003{
INADA Naoki3ae20562017-01-16 20:41:20 +090013004 if (sep == Py_None)
13005 return rsplit(self, NULL, maxsplit);
13006 if (PyUnicode_Check(sep))
13007 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013008
Victor Stinner998b8062018-09-12 00:23:25 +020013009 PyErr_Format(PyExc_TypeError,
13010 "must be str or None, not %.100s",
13011 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013012 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013013}
13014
INADA Naoki3ae20562017-01-16 20:41:20 +090013015/*[clinic input]
13016str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013017
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013018 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013019
13020Return a list of the lines in the string, breaking at line boundaries.
13021
13022Line breaks are not included in the resulting list unless keepends is given and
13023true.
13024[clinic start generated code]*/
13025
13026static PyObject *
13027unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013028/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013029{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013030 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013031}
13032
13033static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013034PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013035{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013036 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013037}
13038
INADA Naoki3ae20562017-01-16 20:41:20 +090013039/*[clinic input]
13040str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013041
INADA Naoki3ae20562017-01-16 20:41:20 +090013042Convert uppercase characters to lowercase and lowercase characters to uppercase.
13043[clinic start generated code]*/
13044
13045static PyObject *
13046unicode_swapcase_impl(PyObject *self)
13047/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013048{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013049 if (PyUnicode_READY(self) == -1)
13050 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013051 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013052}
13053
Larry Hastings61272b72014-01-07 12:41:53 -080013054/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013055
Larry Hastings31826802013-10-19 00:09:25 -070013056@staticmethod
13057str.maketrans as unicode_maketrans
13058
13059 x: object
13060
13061 y: unicode=NULL
13062
13063 z: unicode=NULL
13064
13065 /
13066
13067Return a translation table usable for str.translate().
13068
13069If there is only one argument, it must be a dictionary mapping Unicode
13070ordinals (integers) or characters to Unicode ordinals, strings or None.
13071Character keys will be then converted to ordinals.
13072If there are two arguments, they must be strings of equal length, and
13073in the resulting dictionary, each character in x will be mapped to the
13074character at the same position in y. If there is a third argument, it
13075must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013076[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013077
Larry Hastings31826802013-10-19 00:09:25 -070013078static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013079unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013080/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013081{
Georg Brandlceee0772007-11-27 23:48:05 +000013082 PyObject *new = NULL, *key, *value;
13083 Py_ssize_t i = 0;
13084 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013085
Georg Brandlceee0772007-11-27 23:48:05 +000013086 new = PyDict_New();
13087 if (!new)
13088 return NULL;
13089 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013090 int x_kind, y_kind, z_kind;
13091 void *x_data, *y_data, *z_data;
13092
Georg Brandlceee0772007-11-27 23:48:05 +000013093 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013094 if (!PyUnicode_Check(x)) {
13095 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13096 "be a string if there is a second argument");
13097 goto err;
13098 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013099 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013100 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13101 "arguments must have equal length");
13102 goto err;
13103 }
13104 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013105 x_kind = PyUnicode_KIND(x);
13106 y_kind = PyUnicode_KIND(y);
13107 x_data = PyUnicode_DATA(x);
13108 y_data = PyUnicode_DATA(y);
13109 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13110 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013111 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013112 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013113 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013114 if (!value) {
13115 Py_DECREF(key);
13116 goto err;
13117 }
Georg Brandlceee0772007-11-27 23:48:05 +000013118 res = PyDict_SetItem(new, key, value);
13119 Py_DECREF(key);
13120 Py_DECREF(value);
13121 if (res < 0)
13122 goto err;
13123 }
13124 /* create entries for deleting chars in z */
13125 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126 z_kind = PyUnicode_KIND(z);
13127 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013128 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013129 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013130 if (!key)
13131 goto err;
13132 res = PyDict_SetItem(new, key, Py_None);
13133 Py_DECREF(key);
13134 if (res < 0)
13135 goto err;
13136 }
13137 }
13138 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013139 int kind;
13140 void *data;
13141
Georg Brandlceee0772007-11-27 23:48:05 +000013142 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013143 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013144 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13145 "to maketrans it must be a dict");
13146 goto err;
13147 }
13148 /* copy entries into the new dict, converting string keys to int keys */
13149 while (PyDict_Next(x, &i, &key, &value)) {
13150 if (PyUnicode_Check(key)) {
13151 /* convert string keys to integer keys */
13152 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013153 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013154 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13155 "table must be of length 1");
13156 goto err;
13157 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013158 kind = PyUnicode_KIND(key);
13159 data = PyUnicode_DATA(key);
13160 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013161 if (!newkey)
13162 goto err;
13163 res = PyDict_SetItem(new, newkey, value);
13164 Py_DECREF(newkey);
13165 if (res < 0)
13166 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013167 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013168 /* just keep integer keys */
13169 if (PyDict_SetItem(new, key, value) < 0)
13170 goto err;
13171 } else {
13172 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13173 "be strings or integers");
13174 goto err;
13175 }
13176 }
13177 }
13178 return new;
13179 err:
13180 Py_DECREF(new);
13181 return NULL;
13182}
13183
INADA Naoki3ae20562017-01-16 20:41:20 +090013184/*[clinic input]
13185str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013186
INADA Naoki3ae20562017-01-16 20:41:20 +090013187 table: object
13188 Translation table, which must be a mapping of Unicode ordinals to
13189 Unicode ordinals, strings, or None.
13190 /
13191
13192Replace each character in the string using the given translation table.
13193
13194The table must implement lookup/indexing via __getitem__, for instance a
13195dictionary or list. If this operation raises LookupError, the character is
13196left untouched. Characters mapped to None are deleted.
13197[clinic start generated code]*/
13198
13199static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013200unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013201/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013203 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013204}
13205
INADA Naoki3ae20562017-01-16 20:41:20 +090013206/*[clinic input]
13207str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013208
INADA Naoki3ae20562017-01-16 20:41:20 +090013209Return a copy of the string converted to uppercase.
13210[clinic start generated code]*/
13211
13212static PyObject *
13213unicode_upper_impl(PyObject *self)
13214/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013216 if (PyUnicode_READY(self) == -1)
13217 return NULL;
13218 if (PyUnicode_IS_ASCII(self))
13219 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013220 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013221}
13222
INADA Naoki3ae20562017-01-16 20:41:20 +090013223/*[clinic input]
13224str.zfill as unicode_zfill
13225
13226 width: Py_ssize_t
13227 /
13228
13229Pad a numeric string with zeros on the left, to fill a field of the given width.
13230
13231The string is never truncated.
13232[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013233
13234static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013235unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013236/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013237{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013238 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013239 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013240 int kind;
13241 void *data;
13242 Py_UCS4 chr;
13243
Benjamin Petersonbac79492012-01-14 13:34:47 -050013244 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013246
Victor Stinnerc4b49542011-12-11 22:44:26 +010013247 if (PyUnicode_GET_LENGTH(self) >= width)
13248 return unicode_result_unchanged(self);
13249
13250 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251
13252 u = pad(self, fill, 0, '0');
13253
Walter Dörwald068325e2002-04-15 13:36:47 +000013254 if (u == NULL)
13255 return NULL;
13256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013257 kind = PyUnicode_KIND(u);
13258 data = PyUnicode_DATA(u);
13259 chr = PyUnicode_READ(kind, data, fill);
13260
13261 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013262 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013263 PyUnicode_WRITE(kind, data, 0, chr);
13264 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013265 }
13266
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013267 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013268 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013269}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013270
#if 0
/* Compiled out.  Wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013279PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013280 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013281\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013282Return True if S starts with the specified prefix, False otherwise.\n\
13283With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013284With optional end, stop comparing S at that position.\n\
13285prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013286
13287static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013288unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013289 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013290{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013291 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013292 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013293 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013294 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013295 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296
Jesus Ceaac451502011-04-20 17:09:23 +020013297 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013298 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013299 if (PyTuple_Check(subobj)) {
13300 Py_ssize_t i;
13301 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013302 substring = PyTuple_GET_ITEM(subobj, i);
13303 if (!PyUnicode_Check(substring)) {
13304 PyErr_Format(PyExc_TypeError,
13305 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013306 "not %.100s",
13307 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013308 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013309 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013310 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013311 if (result == -1)
13312 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013313 if (result) {
13314 Py_RETURN_TRUE;
13315 }
13316 }
13317 /* nothing matched */
13318 Py_RETURN_FALSE;
13319 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013320 if (!PyUnicode_Check(subobj)) {
13321 PyErr_Format(PyExc_TypeError,
13322 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013323 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013324 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013325 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013326 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013327 if (result == -1)
13328 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013329 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013330}
13331
13332
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013333PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013334 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013335\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013336Return True if S ends with the specified suffix, False otherwise.\n\
13337With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013338With optional end, stop comparing S at that position.\n\
13339suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013340
13341static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013342unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013343 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013344{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013345 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013346 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013347 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013348 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013349 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013350
Jesus Ceaac451502011-04-20 17:09:23 +020013351 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013352 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013353 if (PyTuple_Check(subobj)) {
13354 Py_ssize_t i;
13355 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013356 substring = PyTuple_GET_ITEM(subobj, i);
13357 if (!PyUnicode_Check(substring)) {
13358 PyErr_Format(PyExc_TypeError,
13359 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013360 "not %.100s",
13361 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013362 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013363 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013364 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013365 if (result == -1)
13366 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013367 if (result) {
13368 Py_RETURN_TRUE;
13369 }
13370 }
13371 Py_RETURN_FALSE;
13372 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013373 if (!PyUnicode_Check(subobj)) {
13374 PyErr_Format(PyExc_TypeError,
13375 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013376 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013377 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013378 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013379 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013380 if (result == -1)
13381 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013382 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013383}
13384
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013385static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013386_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013387{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013388 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13389 writer->data = PyUnicode_DATA(writer->buffer);
13390
13391 if (!writer->readonly) {
13392 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013393 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013394 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013395 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013396 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13397 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13398 writer->kind = PyUnicode_WCHAR_KIND;
13399 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13400
Victor Stinner8f674cc2013-04-17 23:02:17 +020013401 /* Copy-on-write mode: set buffer size to 0 so
13402 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13403 * next write. */
13404 writer->size = 0;
13405 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013406}
13407
Victor Stinnerd3f08822012-05-29 12:57:52 +020013408void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013409_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013410{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013411 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013412
13413 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013414 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013415
13416 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13417 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13418 writer->kind = PyUnicode_WCHAR_KIND;
13419 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013420}
13421
Victor Stinnerd3f08822012-05-29 12:57:52 +020013422int
13423_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13424 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013425{
13426 Py_ssize_t newlen;
13427 PyObject *newbuffer;
13428
Victor Stinner2740e462016-09-06 16:58:36 -070013429 assert(maxchar <= MAX_UNICODE);
13430
Victor Stinnerca9381e2015-09-22 00:58:32 +020013431 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013432 assert((maxchar > writer->maxchar && length >= 0)
13433 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013434
Victor Stinner202fdca2012-05-07 12:47:02 +020013435 if (length > PY_SSIZE_T_MAX - writer->pos) {
13436 PyErr_NoMemory();
13437 return -1;
13438 }
13439 newlen = writer->pos + length;
13440
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013441 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013442
Victor Stinnerd3f08822012-05-29 12:57:52 +020013443 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013444 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013445 if (writer->overallocate
13446 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13447 /* overallocate to limit the number of realloc() */
13448 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013449 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013450 if (newlen < writer->min_length)
13451 newlen = writer->min_length;
13452
Victor Stinnerd3f08822012-05-29 12:57:52 +020013453 writer->buffer = PyUnicode_New(newlen, maxchar);
13454 if (writer->buffer == NULL)
13455 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013456 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013457 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013458 if (writer->overallocate
13459 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13460 /* overallocate to limit the number of realloc() */
13461 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013462 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013463 if (newlen < writer->min_length)
13464 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013465
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013466 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013467 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013468 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013469 newbuffer = PyUnicode_New(newlen, maxchar);
13470 if (newbuffer == NULL)
13471 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013472 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13473 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013474 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013475 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013476 }
13477 else {
13478 newbuffer = resize_compact(writer->buffer, newlen);
13479 if (newbuffer == NULL)
13480 return -1;
13481 }
13482 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013483 }
13484 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013485 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013486 newbuffer = PyUnicode_New(writer->size, maxchar);
13487 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013488 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013489 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13490 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013491 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013492 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013493 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013494 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013495
13496#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013497}
13498
Victor Stinnerca9381e2015-09-22 00:58:32 +020013499int
13500_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13501 enum PyUnicode_Kind kind)
13502{
13503 Py_UCS4 maxchar;
13504
13505 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13506 assert(writer->kind < kind);
13507
13508 switch (kind)
13509 {
13510 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13511 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13512 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13513 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013514 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013515 }
13516
13517 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13518}
13519
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013520static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013521_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013522{
Victor Stinner2740e462016-09-06 16:58:36 -070013523 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013524 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13525 return -1;
13526 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13527 writer->pos++;
13528 return 0;
13529}
13530
13531int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013532_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13533{
13534 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13535}
13536
13537int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013538_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13539{
13540 Py_UCS4 maxchar;
13541 Py_ssize_t len;
13542
13543 if (PyUnicode_READY(str) == -1)
13544 return -1;
13545 len = PyUnicode_GET_LENGTH(str);
13546 if (len == 0)
13547 return 0;
13548 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13549 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013550 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013551 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013552 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013553 Py_INCREF(str);
13554 writer->buffer = str;
13555 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013556 writer->pos += len;
13557 return 0;
13558 }
13559 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13560 return -1;
13561 }
13562 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13563 str, 0, len);
13564 writer->pos += len;
13565 return 0;
13566}
13567
Victor Stinnere215d962012-10-06 23:03:36 +020013568int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013569_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13570 Py_ssize_t start, Py_ssize_t end)
13571{
13572 Py_UCS4 maxchar;
13573 Py_ssize_t len;
13574
13575 if (PyUnicode_READY(str) == -1)
13576 return -1;
13577
13578 assert(0 <= start);
13579 assert(end <= PyUnicode_GET_LENGTH(str));
13580 assert(start <= end);
13581
13582 if (end == 0)
13583 return 0;
13584
13585 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13586 return _PyUnicodeWriter_WriteStr(writer, str);
13587
13588 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13589 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13590 else
13591 maxchar = writer->maxchar;
13592 len = end - start;
13593
13594 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13595 return -1;
13596
13597 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13598 str, start, len);
13599 writer->pos += len;
13600 return 0;
13601}
13602
13603int
Victor Stinner4a587072013-11-19 12:54:53 +010013604_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13605 const char *ascii, Py_ssize_t len)
13606{
13607 if (len == -1)
13608 len = strlen(ascii);
13609
13610 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13611
13612 if (writer->buffer == NULL && !writer->overallocate) {
13613 PyObject *str;
13614
13615 str = _PyUnicode_FromASCII(ascii, len);
13616 if (str == NULL)
13617 return -1;
13618
13619 writer->readonly = 1;
13620 writer->buffer = str;
13621 _PyUnicodeWriter_Update(writer);
13622 writer->pos += len;
13623 return 0;
13624 }
13625
13626 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13627 return -1;
13628
13629 switch (writer->kind)
13630 {
13631 case PyUnicode_1BYTE_KIND:
13632 {
13633 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13634 Py_UCS1 *data = writer->data;
13635
Christian Heimesf051e432016-09-13 20:22:02 +020013636 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013637 break;
13638 }
13639 case PyUnicode_2BYTE_KIND:
13640 {
13641 _PyUnicode_CONVERT_BYTES(
13642 Py_UCS1, Py_UCS2,
13643 ascii, ascii + len,
13644 (Py_UCS2 *)writer->data + writer->pos);
13645 break;
13646 }
13647 case PyUnicode_4BYTE_KIND:
13648 {
13649 _PyUnicode_CONVERT_BYTES(
13650 Py_UCS1, Py_UCS4,
13651 ascii, ascii + len,
13652 (Py_UCS4 *)writer->data + writer->pos);
13653 break;
13654 }
13655 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013656 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013657 }
13658
13659 writer->pos += len;
13660 return 0;
13661}
13662
13663int
13664_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13665 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013666{
13667 Py_UCS4 maxchar;
13668
13669 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13670 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13671 return -1;
13672 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13673 writer->pos += len;
13674 return 0;
13675}
13676
Victor Stinnerd3f08822012-05-29 12:57:52 +020013677PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013678_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013679{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013680 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013681
Victor Stinnerd3f08822012-05-29 12:57:52 +020013682 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013683 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013684 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013685 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013686
13687 str = writer->buffer;
13688 writer->buffer = NULL;
13689
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013690 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013691 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13692 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013693 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013694
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013695 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13696 PyObject *str2;
13697 str2 = resize_compact(str, writer->pos);
13698 if (str2 == NULL) {
13699 Py_DECREF(str);
13700 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013701 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013702 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013703 }
13704
Victor Stinner15a0bd32013-07-08 22:29:55 +020013705 assert(_PyUnicode_CheckConsistency(str, 1));
13706 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013707}
13708
/* Drop the writer's reference to its buffer, if any, and reset
   writer->buffer to NULL (both done by Py_CLEAR). */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013709void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013710_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013711{
13712 Py_CLEAR(writer->buffer);
13713}
13714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013715#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013716
13717PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013718 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013719\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013720Return a formatted version of S, using substitutions from args and kwargs.\n\
13721The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013722
Eric Smith27bbca62010-11-04 17:06:58 +000013723PyDoc_STRVAR(format_map__doc__,
13724 "S.format_map(mapping) -> str\n\
13725\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013726Return a formatted version of S, using substitutions from mapping.\n\
13727The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013728
INADA Naoki3ae20562017-01-16 20:41:20 +090013729/*[clinic input]
13730str.__format__ as unicode___format__
13731
13732 format_spec: unicode
13733 /
13734
13735Return a formatted version of the string as described by format_spec.
13736[clinic start generated code]*/
13737
Eric Smith4a7d76d2008-05-30 18:10:19 +000013738static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013739unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013740/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013741{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013742 _PyUnicodeWriter writer;
13743 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013744
Victor Stinnerd3f08822012-05-29 12:57:52 +020013745 if (PyUnicode_READY(self) == -1)
13746 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013747 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013748 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13749 self, format_spec, 0,
13750 PyUnicode_GET_LENGTH(format_spec));
13751 if (ret == -1) {
13752 _PyUnicodeWriter_Dealloc(&writer);
13753 return NULL;
13754 }
13755 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013756}
13757
INADA Naoki3ae20562017-01-16 20:41:20 +090013758/*[clinic input]
13759str.__sizeof__ as unicode_sizeof
13760
13761Return the size of the string in memory, in bytes.
13762[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013763
13764static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013765unicode_sizeof_impl(PyObject *self)
13766/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013767{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013768 Py_ssize_t size;
13769
13770 /* If it's a compact object, account for base structure +
13771 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013772 if (PyUnicode_IS_COMPACT_ASCII(self))
13773 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13774 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013775 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013776 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013777 else {
13778 /* If it is a two-block object, account for base object, and
13779 for character block if present. */
13780 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013781 if (_PyUnicode_DATA_ANY(self))
13782 size += (PyUnicode_GET_LENGTH(self) + 1) *
13783 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013784 }
13785 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013786 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013787 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13788 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13789 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13790 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013791
13792 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013793}
13794
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013795static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013796unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013797{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013798 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013799 if (!copy)
13800 return NULL;
13801 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013802}
13803
/* Method table listing the methods of str objects.

   The UNICODE_*_METHODDEF names are macros defined outside this part of the
   file (presumably Argument Clinic generated -- confirm); the brace-enclosed
   entries are written by hand.  The table is terminated by the
   {NULL, NULL} sentinel. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013804static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013805 UNICODE_ENCODE_METHODDEF
13806 UNICODE_REPLACE_METHODDEF
13807 UNICODE_SPLIT_METHODDEF
13808 UNICODE_RSPLIT_METHODDEF
13809 UNICODE_JOIN_METHODDEF
13810 UNICODE_CAPITALIZE_METHODDEF
13811 UNICODE_CASEFOLD_METHODDEF
13812 UNICODE_TITLE_METHODDEF
13813 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013814 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013815 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013816 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013817 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013818 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013819 UNICODE_LJUST_METHODDEF
13820 UNICODE_LOWER_METHODDEF
13821 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013822 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13823 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013824 UNICODE_RJUST_METHODDEF
13825 UNICODE_RSTRIP_METHODDEF
13826 UNICODE_RPARTITION_METHODDEF
13827 UNICODE_SPLITLINES_METHODDEF
13828 UNICODE_STRIP_METHODDEF
13829 UNICODE_SWAPCASE_METHODDEF
13830 UNICODE_TRANSLATE_METHODDEF
13831 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013832 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13833 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090013834 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013835 UNICODE_ISLOWER_METHODDEF
13836 UNICODE_ISUPPER_METHODDEF
13837 UNICODE_ISTITLE_METHODDEF
13838 UNICODE_ISSPACE_METHODDEF
13839 UNICODE_ISDECIMAL_METHODDEF
13840 UNICODE_ISDIGIT_METHODDEF
13841 UNICODE_ISNUMERIC_METHODDEF
13842 UNICODE_ISALPHA_METHODDEF
13843 UNICODE_ISALNUM_METHODDEF
13844 UNICODE_ISIDENTIFIER_METHODDEF
13845 UNICODE_ISPRINTABLE_METHODDEF
13846 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020013847 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013848 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013849 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013850 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013851 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013852#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013853 /* These methods are just used for debugging the implementation;
   the entry is compiled out by the surrounding "#if 0". */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013854 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013855#endif
13856
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013857 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013858 {NULL, NULL}
13859};
13860
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013861static PyObject *
13862unicode_mod(PyObject *v, PyObject *w)
13863{
Brian Curtindfc80e32011-08-10 20:28:54 -050013864 if (!PyUnicode_Check(v))
13865 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013866 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013867}
13868
/* Number slots of str: only the remainder slot is filled (with
   unicode_mod); the slots listed before it are 0 and the slots after it
   are left to C's implicit zero-initialization. */
13869static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013870 0, /*nb_add*/
13871 0, /*nb_subtract*/
13872 0, /*nb_multiply*/
13873 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013874};
13875
/* Sequence slots of str: length, concatenation, repetition, item access
   and containment are provided; the slice and item-assignment slots are 0. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013876static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013877 (lenfunc) unicode_length, /* sq_length */
13878 PyUnicode_Concat, /* sq_concat */
13879 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13880 (ssizeargfunc) unicode_getitem, /* sq_item */
13881 0, /* sq_slice */
13882 0, /* sq_ass_item */
13883 0, /* sq_ass_slice */
13884 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013885};
13886
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013887static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013888unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013889{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013890 if (PyUnicode_READY(self) == -1)
13891 return NULL;
13892
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013893 if (PyIndex_Check(item)) {
13894 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013895 if (i == -1 && PyErr_Occurred())
13896 return NULL;
13897 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013898 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013899 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013900 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013901 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013902 PyObject *result;
13903 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013904 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013905 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013906
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013907 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013908 return NULL;
13909 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013910 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13911 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013912
13913 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013914 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013915 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013916 slicelength == PyUnicode_GET_LENGTH(self)) {
13917 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013918 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013919 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013920 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013921 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013922 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013923 src_kind = PyUnicode_KIND(self);
13924 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013925 if (!PyUnicode_IS_ASCII(self)) {
13926 kind_limit = kind_maxchar_limit(src_kind);
13927 max_char = 0;
13928 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13929 ch = PyUnicode_READ(src_kind, src_data, cur);
13930 if (ch > max_char) {
13931 max_char = ch;
13932 if (max_char >= kind_limit)
13933 break;
13934 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013935 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013936 }
Victor Stinner55c99112011-10-13 01:17:06 +020013937 else
13938 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013939 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013940 if (result == NULL)
13941 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013942 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013943 dest_data = PyUnicode_DATA(result);
13944
13945 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013946 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13947 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013948 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013949 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013950 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013951 } else {
13952 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13953 return NULL;
13954 }
13955}
13956
/* Mapping slots of str: length and subscription are provided; the
   subscript-assignment slot is 0. */
13957static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013958 (lenfunc)unicode_length, /* mp_length */
13959 (binaryfunc)unicode_subscript, /* mp_subscript */
13960 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013961};
13962
Guido van Rossumd57fd912000-03-10 22:53:23 +000013963
Guido van Rossumd57fd912000-03-10 22:53:23 +000013964/* Helpers for PyUnicode_Format() */
13965
/* Context passed between the PyUnicode_Format() helpers. */
Victor Stinnera47082312012-10-04 02:19:54 +020013966struct unicode_formatter_t {
/* Arguments to format.  unicode_format_getnextarg() returns `args` itself
   when arglen is negative and indexes it as a tuple otherwise. */
13967 PyObject *args;
/* NOTE(review): presumably nonzero when this context holds its own
   reference to `args` -- confirm against PyUnicode_Format(). */
13968 int args_owned;
/* arglen: number of arguments (negative: `args` is one non-tuple argument);
   argidx: index of the next argument handed out by
   unicode_format_getnextarg(). */
13969 Py_ssize_t arglen, argidx;
/* NOTE(review): presumably the mapping used for "%(name)" lookups --
   confirm against the argument parser. */
13970 PyObject *dict;
13971
/* NOTE(review): the fmt* fields presumably describe the format string
   (kind, remaining count, current position, data pointer, object) --
   their use is not visible in this part of the file; confirm. */
13972 enum PyUnicode_Kind fmtkind;
13973 Py_ssize_t fmtcnt, fmtpos;
13974 void *fmtdata;
13975 PyObject *fmtstr;
13976
/* Writer embedded in the context; presumably accumulates the formatted
   output -- confirm. */
13977 _PyUnicodeWriter writer;
13978};
13979
/* Parsed description of one conversion, as consumed by formatfloat() and
   mainformatlong(). */
13980struct unicode_format_arg_t {
/* Conversion character; formatfloat() passes it to PyOS_double_to_string()
   and mainformatlong() switches on it (d, i, u, o, x, X). */
13981 Py_UCS4 ch;
/* Bitmask tested against F_ALT, F_SIGN and F_BLANK. */
13982 int flags;
/* Field width; mainformatlong() only takes its fast path when this is -1
   (presumably "not specified" -- confirm). */
13983 Py_ssize_t width;
/* Precision; formatfloat() substitutes 6 when it is negative, and
   mainformatlong() only takes its fast path when it is -1. */
13984 int prec;
/* NOTE(review): not read in this part of the file; meaning to be confirmed
   against the code that fills it in. */
13985 int sign;
13986};
13987
Guido van Rossumd57fd912000-03-10 22:53:23 +000013988static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013989unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013990{
Victor Stinnera47082312012-10-04 02:19:54 +020013991 Py_ssize_t argidx = ctx->argidx;
13992
13993 if (argidx < ctx->arglen) {
13994 ctx->argidx++;
13995 if (ctx->arglen < 0)
13996 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013997 else
Victor Stinnera47082312012-10-04 02:19:54 +020013998 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013999 }
14000 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014001 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014002 return NULL;
14003}
14004
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014005/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014006
Victor Stinnera47082312012-10-04 02:19:54 +020014007/* Format a float into the writer if the writer is not NULL, or into *p_output
14008 otherwise.
14009
14010 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014011static int
Victor Stinnera47082312012-10-04 02:19:54 +020014012formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14013 PyObject **p_output,
14014 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014015{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014016 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014017 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014018 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014019 int prec;
14020 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014021
Guido van Rossumd57fd912000-03-10 22:53:23 +000014022 x = PyFloat_AsDouble(v);
14023 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014024 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014025
Victor Stinnera47082312012-10-04 02:19:54 +020014026 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014027 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014028 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014029
Victor Stinnera47082312012-10-04 02:19:54 +020014030 if (arg->flags & F_ALT)
14031 dtoa_flags = Py_DTSF_ALT;
14032 else
14033 dtoa_flags = 0;
14034 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014035 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014036 return -1;
14037 len = strlen(p);
14038 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014039 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014040 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014041 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014042 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014043 }
14044 else
14045 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014046 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014047 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014048}
14049
Victor Stinnerd0880d52012-04-27 23:40:13 +020014050/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate form, for Python ints.
 * Return value: a new reference to a str object, or NULL if error.
 * The output string is of the form
 *    "-"? ("0o" | "0x" | "0X")? digit+
 * The base marker is present only for o, x and X conversions when alt is
 * nonzero.  The case of hex digits will be correct.
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 *    val    object to be converted; must satisfy PyLong_Check() (asserted)
 *    alt    nonzero to keep the base marker
 *    prec   minimum number of digits; 0-fill on left if needed
 *    type   a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION: o, x and X conversions can produce a '-' sign for negative
 * values.
 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014068PyObject *
14069_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014070{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014071 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014072 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014073 Py_ssize_t i;
14074 int sign; /* 1 if '-', else 0 */
14075 int len; /* number of characters */
14076 Py_ssize_t llen;
14077 int numdigits; /* len == numnondigits + numdigits */
14078 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014079
Victor Stinnerd0880d52012-04-27 23:40:13 +020014080 /* numnondigits (at most 3) + prec is computed as an int below; keep it in range */
14081 if (prec > INT_MAX-3) {
14082 PyErr_SetString(PyExc_OverflowError,
14083 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014084 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014085 }
14086
14087 assert(PyLong_Check(val));
14088
14089 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014090 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014091 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014092 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014093 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014094 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014095 /* int and int subclasses should print numerically when a numeric */
14096 /* format code is used (see issue18780) */
14097 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014098 break;
14099 case 'o':
14100 numnondigits = 2;
14101 result = PyNumber_ToBase(val, 8);
14102 break;
14103 case 'x':
14104 case 'X':
14105 numnondigits = 2;
14106 result = PyNumber_ToBase(val, 16);
14107 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014108 }
14109 if (!result)
14110 return NULL;
14111
14112 assert(unicode_modifiable(result));
14113 assert(PyUnicode_IS_READY(result));
14114 assert(PyUnicode_IS_ASCII(result));
14115
14116 /* To modify the string in-place, there can only be one reference. */
14117 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014118 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014119 PyErr_BadInternalCall();
14120 return NULL;
14121 }
14122 buf = PyUnicode_DATA(result);
14123 llen = PyUnicode_GET_LENGTH(result);
14124 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014125 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014126 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014127 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014128 return NULL;
14129 }
14130 len = (int)llen;
14131 sign = buf[0] == '-';
14132 numnondigits += sign;
14133 numdigits = len - numnondigits;
14134 assert(numdigits > 0);
14135
14136 /* Get rid of base marker unless alt is set.  buf is advanced past
    the two marker characters; for a negative number the '-' is rewritten
    over the last marker character so that it directly precedes the
    digits. */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014137 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014138 (type == 'o' || type == 'x' || type == 'X'))) {
14139 assert(buf[sign] == '0');
14140 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14141 buf[sign+1] == 'o');
14142 numnondigits -= 2;
14143 buf += 2;
14144 len -= 2;
14145 if (sign)
14146 buf[0] = '-';
14147 assert(len == numnondigits + numdigits);
14148 assert(numdigits > 0);
14149 }
14150
14151 /* Fill with leading zeroes to reach at least prec digits.  The padded
    text is built in a temporary bytes object, which replaces result. */
14152 if (prec > numdigits) {
14153 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14154 numnondigits + prec);
14155 char *b1;
14156 if (!r1) {
14157 Py_DECREF(result);
14158 return NULL;
14159 }
14160 b1 = PyBytes_AS_STRING(r1);
14161 for (i = 0; i < numnondigits; ++i)
14162 *b1++ = *buf++;
14163 for (i = 0; i < prec - numdigits; i++)
14164 *b1++ = '0';
14165 for (i = 0; i < numdigits; i++)
14166 *b1++ = *buf++;
14167 *b1 = '\0';
14168 Py_DECREF(result);
14169 result = r1;
14170 buf = PyBytes_AS_STRING(result);
14171 len = numnondigits + prec;
14172 }
14173
14174 /* Fix up case for hex conversions. */
14175 if (type == 'X') {
14176 /* Need to convert all lower case letters to upper case.
14177 and need to convert 0x to 0X (and -0x to -0X). */
14178 for (i = 0; i < len; i++)
14179 if (buf[i] >= 'a' && buf[i] <= 'x')
14180 buf[i] -= 'a'-'A';
14181 }
/* If result is the temporary bytes object, or buf no longer points at the
   start of result's data (the base marker was skipped), build a new str
   from buf.  Otherwise only adjust the length if it changed. */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014182 if (!PyUnicode_Check(result)
14183 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014184 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014185 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014186 Py_DECREF(result);
14187 result = unicode;
14188 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014189 else if (len != PyUnicode_GET_LENGTH(result)) {
14190 if (PyUnicode_Resize(&result, len) < 0)
14191 Py_CLEAR(result);
14192 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014193 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014194}
14195
Ethan Furmandf3ed242014-01-05 06:50:30 -080014196/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014197 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014198 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014199 * -1 and raise an exception on error */
14200static int
Victor Stinnera47082312012-10-04 02:19:54 +020014201mainformatlong(PyObject *v,
14202 struct unicode_format_arg_t *arg,
14203 PyObject **p_output,
14204 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014205{
14206 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014207 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014208
14209 if (!PyNumber_Check(v))
14210 goto wrongtype;
14211
Ethan Furman9ab74802014-03-21 06:38:46 -070014212 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014213 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014214 if (type == 'o' || type == 'x' || type == 'X') {
14215 iobj = PyNumber_Index(v);
14216 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014217 if (PyErr_ExceptionMatches(PyExc_TypeError))
14218 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014219 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014220 }
14221 }
14222 else {
14223 iobj = PyNumber_Long(v);
14224 if (iobj == NULL ) {
14225 if (PyErr_ExceptionMatches(PyExc_TypeError))
14226 goto wrongtype;
14227 return -1;
14228 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014229 }
14230 assert(PyLong_Check(iobj));
14231 }
14232 else {
14233 iobj = v;
14234 Py_INCREF(iobj);
14235 }
14236
14237 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014238 && arg->width == -1 && arg->prec == -1
14239 && !(arg->flags & (F_SIGN | F_BLANK))
14240 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014241 {
14242 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014243 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014244 int base;
14245
Victor Stinnera47082312012-10-04 02:19:54 +020014246 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014247 {
14248 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014249 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014250 case 'd':
14251 case 'i':
14252 case 'u':
14253 base = 10;
14254 break;
14255 case 'o':
14256 base = 8;
14257 break;
14258 case 'x':
14259 case 'X':
14260 base = 16;
14261 break;
14262 }
14263
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014264 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14265 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014266 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014267 }
14268 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014269 return 1;
14270 }
14271
Ethan Furmanb95b5612015-01-23 20:05:18 -080014272 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014273 Py_DECREF(iobj);
14274 if (res == NULL)
14275 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014276 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014277 return 0;
14278
14279wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014280 switch(type)
14281 {
14282 case 'o':
14283 case 'x':
14284 case 'X':
14285 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014286 "%%%c format: an integer is required, "
14287 "not %.200s",
14288 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014289 break;
14290 default:
14291 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014292 "%%%c format: a number is required, "
14293 "not %.200s",
14294 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014295 break;
14296 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014297 return -1;
14298}
14299
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014300static Py_UCS4
14301formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014302{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014303 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014304 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014305 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014306 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014307 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014308 goto onError;
14309 }
14310 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014311 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014312 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014313 /* make sure number is a type of integer */
14314 if (!PyLong_Check(v)) {
14315 iobj = PyNumber_Index(v);
14316 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014317 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014318 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014319 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014320 Py_DECREF(iobj);
14321 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014322 else {
14323 x = PyLong_AsLong(v);
14324 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014325 if (x == -1 && PyErr_Occurred())
14326 goto onError;
14327
Victor Stinner8faf8212011-12-08 22:14:11 +010014328 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014329 PyErr_SetString(PyExc_OverflowError,
14330 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014331 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014332 }
14333
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014334 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014335 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014336
Benjamin Peterson29060642009-01-31 22:14:21 +000014337 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014338 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014339 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014340 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014341}
14342
Victor Stinnera47082312012-10-04 02:19:54 +020014343/* Parse options of an argument: flags, width, precision.
14344 Handle also "%(name)" syntax.
14345
14346 Return 0 if the argument has been formatted into arg->str.
14347 Return 1 if the argument has been written into ctx->writer,
14348 Raise an exception and return -1 on error. */
14349static int
14350unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14351 struct unicode_format_arg_t *arg)
14352{
14353#define FORMAT_READ(ctx) \
14354 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14355
14356 PyObject *v;
14357
Victor Stinnera47082312012-10-04 02:19:54 +020014358 if (arg->ch == '(') {
14359 /* Get argument value from a dictionary. Example: "%(name)s". */
14360 Py_ssize_t keystart;
14361 Py_ssize_t keylen;
14362 PyObject *key;
14363 int pcount = 1;
14364
14365 if (ctx->dict == NULL) {
14366 PyErr_SetString(PyExc_TypeError,
14367 "format requires a mapping");
14368 return -1;
14369 }
14370 ++ctx->fmtpos;
14371 --ctx->fmtcnt;
14372 keystart = ctx->fmtpos;
14373 /* Skip over balanced parentheses */
14374 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14375 arg->ch = FORMAT_READ(ctx);
14376 if (arg->ch == ')')
14377 --pcount;
14378 else if (arg->ch == '(')
14379 ++pcount;
14380 ctx->fmtpos++;
14381 }
14382 keylen = ctx->fmtpos - keystart - 1;
14383 if (ctx->fmtcnt < 0 || pcount > 0) {
14384 PyErr_SetString(PyExc_ValueError,
14385 "incomplete format key");
14386 return -1;
14387 }
14388 key = PyUnicode_Substring(ctx->fmtstr,
14389 keystart, keystart + keylen);
14390 if (key == NULL)
14391 return -1;
14392 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014393 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014394 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014395 }
14396 ctx->args = PyObject_GetItem(ctx->dict, key);
14397 Py_DECREF(key);
14398 if (ctx->args == NULL)
14399 return -1;
14400 ctx->args_owned = 1;
14401 ctx->arglen = -1;
14402 ctx->argidx = -2;
14403 }
14404
14405 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014406 while (--ctx->fmtcnt >= 0) {
14407 arg->ch = FORMAT_READ(ctx);
14408 ctx->fmtpos++;
14409 switch (arg->ch) {
14410 case '-': arg->flags |= F_LJUST; continue;
14411 case '+': arg->flags |= F_SIGN; continue;
14412 case ' ': arg->flags |= F_BLANK; continue;
14413 case '#': arg->flags |= F_ALT; continue;
14414 case '0': arg->flags |= F_ZERO; continue;
14415 }
14416 break;
14417 }
14418
14419 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014420 if (arg->ch == '*') {
14421 v = unicode_format_getnextarg(ctx);
14422 if (v == NULL)
14423 return -1;
14424 if (!PyLong_Check(v)) {
14425 PyErr_SetString(PyExc_TypeError,
14426 "* wants int");
14427 return -1;
14428 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014429 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014430 if (arg->width == -1 && PyErr_Occurred())
14431 return -1;
14432 if (arg->width < 0) {
14433 arg->flags |= F_LJUST;
14434 arg->width = -arg->width;
14435 }
14436 if (--ctx->fmtcnt >= 0) {
14437 arg->ch = FORMAT_READ(ctx);
14438 ctx->fmtpos++;
14439 }
14440 }
14441 else if (arg->ch >= '0' && arg->ch <= '9') {
14442 arg->width = arg->ch - '0';
14443 while (--ctx->fmtcnt >= 0) {
14444 arg->ch = FORMAT_READ(ctx);
14445 ctx->fmtpos++;
14446 if (arg->ch < '0' || arg->ch > '9')
14447 break;
14448 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14449 mixing signed and unsigned comparison. Since arg->ch is between
14450 '0' and '9', casting to int is safe. */
14451 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14452 PyErr_SetString(PyExc_ValueError,
14453 "width too big");
14454 return -1;
14455 }
14456 arg->width = arg->width*10 + (arg->ch - '0');
14457 }
14458 }
14459
14460 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014461 if (arg->ch == '.') {
14462 arg->prec = 0;
14463 if (--ctx->fmtcnt >= 0) {
14464 arg->ch = FORMAT_READ(ctx);
14465 ctx->fmtpos++;
14466 }
14467 if (arg->ch == '*') {
14468 v = unicode_format_getnextarg(ctx);
14469 if (v == NULL)
14470 return -1;
14471 if (!PyLong_Check(v)) {
14472 PyErr_SetString(PyExc_TypeError,
14473 "* wants int");
14474 return -1;
14475 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014476 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014477 if (arg->prec == -1 && PyErr_Occurred())
14478 return -1;
14479 if (arg->prec < 0)
14480 arg->prec = 0;
14481 if (--ctx->fmtcnt >= 0) {
14482 arg->ch = FORMAT_READ(ctx);
14483 ctx->fmtpos++;
14484 }
14485 }
14486 else if (arg->ch >= '0' && arg->ch <= '9') {
14487 arg->prec = arg->ch - '0';
14488 while (--ctx->fmtcnt >= 0) {
14489 arg->ch = FORMAT_READ(ctx);
14490 ctx->fmtpos++;
14491 if (arg->ch < '0' || arg->ch > '9')
14492 break;
14493 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14494 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014495 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014496 return -1;
14497 }
14498 arg->prec = arg->prec*10 + (arg->ch - '0');
14499 }
14500 }
14501 }
14502
14503 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14504 if (ctx->fmtcnt >= 0) {
14505 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14506 if (--ctx->fmtcnt >= 0) {
14507 arg->ch = FORMAT_READ(ctx);
14508 ctx->fmtpos++;
14509 }
14510 }
14511 }
14512 if (ctx->fmtcnt < 0) {
14513 PyErr_SetString(PyExc_ValueError,
14514 "incomplete format");
14515 return -1;
14516 }
14517 return 0;
14518
14519#undef FORMAT_READ
14520}
14521
14522/* Format one argument. Supported conversion specifiers:
14523
14524 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014525 - "i", "d", "u": int or float
14526 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014527 - "e", "E", "f", "F", "g", "G": float
14528 - "c": int or str (1 character)
14529
Victor Stinner8dbd4212012-12-04 09:30:24 +010014530 When possible, the output is written directly into the Unicode writer
14531 (ctx->writer). A string is created when padding is required.
14532
Victor Stinnera47082312012-10-04 02:19:54 +020014533 Return 0 if the argument has been formatted into *p_str,
14534 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014535 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014536static int
14537unicode_format_arg_format(struct unicode_formatter_t *ctx,
14538 struct unicode_format_arg_t *arg,
14539 PyObject **p_str)
14540{
14541 PyObject *v;
14542 _PyUnicodeWriter *writer = &ctx->writer;
14543
14544 if (ctx->fmtcnt == 0)
14545 ctx->writer.overallocate = 0;
14546
Victor Stinnera47082312012-10-04 02:19:54 +020014547 v = unicode_format_getnextarg(ctx);
14548 if (v == NULL)
14549 return -1;
14550
Victor Stinnera47082312012-10-04 02:19:54 +020014551
14552 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014553 case 's':
14554 case 'r':
14555 case 'a':
14556 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14557 /* Fast path */
14558 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14559 return -1;
14560 return 1;
14561 }
14562
14563 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14564 *p_str = v;
14565 Py_INCREF(*p_str);
14566 }
14567 else {
14568 if (arg->ch == 's')
14569 *p_str = PyObject_Str(v);
14570 else if (arg->ch == 'r')
14571 *p_str = PyObject_Repr(v);
14572 else
14573 *p_str = PyObject_ASCII(v);
14574 }
14575 break;
14576
14577 case 'i':
14578 case 'd':
14579 case 'u':
14580 case 'o':
14581 case 'x':
14582 case 'X':
14583 {
14584 int ret = mainformatlong(v, arg, p_str, writer);
14585 if (ret != 0)
14586 return ret;
14587 arg->sign = 1;
14588 break;
14589 }
14590
14591 case 'e':
14592 case 'E':
14593 case 'f':
14594 case 'F':
14595 case 'g':
14596 case 'G':
14597 if (arg->width == -1 && arg->prec == -1
14598 && !(arg->flags & (F_SIGN | F_BLANK)))
14599 {
14600 /* Fast path */
14601 if (formatfloat(v, arg, NULL, writer) == -1)
14602 return -1;
14603 return 1;
14604 }
14605
14606 arg->sign = 1;
14607 if (formatfloat(v, arg, p_str, NULL) == -1)
14608 return -1;
14609 break;
14610
14611 case 'c':
14612 {
14613 Py_UCS4 ch = formatchar(v);
14614 if (ch == (Py_UCS4) -1)
14615 return -1;
14616 if (arg->width == -1 && arg->prec == -1) {
14617 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014618 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014619 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014620 return 1;
14621 }
14622 *p_str = PyUnicode_FromOrdinal(ch);
14623 break;
14624 }
14625
14626 default:
14627 PyErr_Format(PyExc_ValueError,
14628 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014629 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014630 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14631 (int)arg->ch,
14632 ctx->fmtpos - 1);
14633 return -1;
14634 }
14635 if (*p_str == NULL)
14636 return -1;
14637 assert (PyUnicode_Check(*p_str));
14638 return 0;
14639}
14640
/* Write the formatted string str into ctx->writer, applying the sign,
   width, precision and alternate-form options stored in arg.

   arg->width and arg->sign are modified while the output is laid out.

   Return 0 on success, raise an exception and return -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Pad with '0' only when the sign handling is enabled (arg->sign)
       and the "0" flag was given; otherwise pad with spaces. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        /* No padding, no truncation, no sign to add: copy str as is. */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign is written separately: skip it in str, so len
               no longer counts it. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character has to be written. */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* The fill character only matters if left padding will be written. */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    buflen = arg->width;
    /* One extra character for the sign when the digits alone already
       fill the whole width. */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With a non-space fill the sign goes before the padding; with
           space fill it is written after the padding (see below). */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for separately from
           the digits copied later. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    /* Right padding always uses spaces, whatever the value of fill. */
    if (arg->width > len) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14795
14796/* Helper of PyUnicode_Format(): format one arg.
14797 Return 0 on success, raise an exception and return -1 on error. */
14798static int
14799unicode_format_arg(struct unicode_formatter_t *ctx)
14800{
14801 struct unicode_format_arg_t arg;
14802 PyObject *str;
14803 int ret;
14804
Victor Stinner8dbd4212012-12-04 09:30:24 +010014805 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014806 if (arg.ch == '%') {
14807 ctx->fmtpos++;
14808 ctx->fmtcnt--;
14809 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14810 return -1;
14811 return 0;
14812 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014813 arg.flags = 0;
14814 arg.width = -1;
14815 arg.prec = -1;
14816 arg.sign = 0;
14817 str = NULL;
14818
Victor Stinnera47082312012-10-04 02:19:54 +020014819 ret = unicode_format_arg_parse(ctx, &arg);
14820 if (ret == -1)
14821 return -1;
14822
14823 ret = unicode_format_arg_format(ctx, &arg, &str);
14824 if (ret == -1)
14825 return -1;
14826
14827 if (ret != 1) {
14828 ret = unicode_format_arg_output(ctx, &arg, str);
14829 Py_DECREF(str);
14830 if (ret == -1)
14831 return -1;
14832 }
14833
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014834 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014835 PyErr_SetString(PyExc_TypeError,
14836 "not all arguments converted during string formatting");
14837 return -1;
14838 }
14839 return 0;
14840}
14841
Alexander Belopolsky40018472011-02-26 01:02:56 +000014842PyObject *
14843PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014844{
Victor Stinnera47082312012-10-04 02:19:54 +020014845 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014846
Guido van Rossumd57fd912000-03-10 22:53:23 +000014847 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014848 PyErr_BadInternalCall();
14849 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014850 }
Victor Stinnera47082312012-10-04 02:19:54 +020014851
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014852 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014853 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014854
14855 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014856 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14857 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14858 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14859 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014860
Victor Stinner8f674cc2013-04-17 23:02:17 +020014861 _PyUnicodeWriter_Init(&ctx.writer);
14862 ctx.writer.min_length = ctx.fmtcnt + 100;
14863 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014864
Guido van Rossumd57fd912000-03-10 22:53:23 +000014865 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014866 ctx.arglen = PyTuple_Size(args);
14867 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014868 }
14869 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014870 ctx.arglen = -1;
14871 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014872 }
Victor Stinnera47082312012-10-04 02:19:54 +020014873 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014874 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014875 ctx.dict = args;
14876 else
14877 ctx.dict = NULL;
14878 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014879
Victor Stinnera47082312012-10-04 02:19:54 +020014880 while (--ctx.fmtcnt >= 0) {
14881 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014882 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014883
14884 nonfmtpos = ctx.fmtpos++;
14885 while (ctx.fmtcnt >= 0 &&
14886 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14887 ctx.fmtpos++;
14888 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014889 }
Victor Stinnera47082312012-10-04 02:19:54 +020014890 if (ctx.fmtcnt < 0) {
14891 ctx.fmtpos--;
14892 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014893 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014894
Victor Stinnercfc4c132013-04-03 01:48:39 +020014895 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14896 nonfmtpos, ctx.fmtpos) < 0)
14897 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014898 }
14899 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014900 ctx.fmtpos++;
14901 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014902 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014903 }
14904 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014905
Victor Stinnera47082312012-10-04 02:19:54 +020014906 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014907 PyErr_SetString(PyExc_TypeError,
14908 "not all arguments converted during string formatting");
14909 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014910 }
14911
Victor Stinnera47082312012-10-04 02:19:54 +020014912 if (ctx.args_owned) {
14913 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014914 }
Victor Stinnera47082312012-10-04 02:19:54 +020014915 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014916
Benjamin Peterson29060642009-01-31 22:14:21 +000014917 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014918 _PyUnicodeWriter_Dealloc(&ctx.writer);
14919 if (ctx.args_owned) {
14920 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014921 }
14922 return NULL;
14923}
14924
Jeremy Hylton938ace62002-07-17 16:30:39 +000014925static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014926unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14927
Tim Peters6d6c1a32001-08-02 04:15:00 +000014928static PyObject *
14929unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14930{
Benjamin Peterson29060642009-01-31 22:14:21 +000014931 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014932 static char *kwlist[] = {"object", "encoding", "errors", 0};
14933 char *encoding = NULL;
14934 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014935
Benjamin Peterson14339b62009-01-31 16:36:08 +000014936 if (type != &PyUnicode_Type)
14937 return unicode_subtype_new(type, args, kwds);
14938 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014939 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014940 return NULL;
14941 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014942 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014943 if (encoding == NULL && errors == NULL)
14944 return PyObject_Str(x);
14945 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014946 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014947}
14948
Guido van Rossume023fe02001-08-30 03:12:59 +000014949static PyObject *
14950unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14951{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014952 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014953 Py_ssize_t length, char_size;
14954 int share_wstr, share_utf8;
14955 unsigned int kind;
14956 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014957
Benjamin Peterson14339b62009-01-31 16:36:08 +000014958 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014959
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014960 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014961 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014962 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014963 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014964 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014965 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014966 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014967 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014968
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014969 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014970 if (self == NULL) {
14971 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014972 return NULL;
14973 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014974 kind = PyUnicode_KIND(unicode);
14975 length = PyUnicode_GET_LENGTH(unicode);
14976
14977 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014978#ifdef Py_DEBUG
14979 _PyUnicode_HASH(self) = -1;
14980#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014981 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014982#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014983 _PyUnicode_STATE(self).interned = 0;
14984 _PyUnicode_STATE(self).kind = kind;
14985 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014986 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014987 _PyUnicode_STATE(self).ready = 1;
14988 _PyUnicode_WSTR(self) = NULL;
14989 _PyUnicode_UTF8_LENGTH(self) = 0;
14990 _PyUnicode_UTF8(self) = NULL;
14991 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014992 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014993
14994 share_utf8 = 0;
14995 share_wstr = 0;
14996 if (kind == PyUnicode_1BYTE_KIND) {
14997 char_size = 1;
14998 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14999 share_utf8 = 1;
15000 }
15001 else if (kind == PyUnicode_2BYTE_KIND) {
15002 char_size = 2;
15003 if (sizeof(wchar_t) == 2)
15004 share_wstr = 1;
15005 }
15006 else {
15007 assert(kind == PyUnicode_4BYTE_KIND);
15008 char_size = 4;
15009 if (sizeof(wchar_t) == 4)
15010 share_wstr = 1;
15011 }
15012
15013 /* Ensure we won't overflow the length. */
15014 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15015 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015016 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015017 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015018 data = PyObject_MALLOC((length + 1) * char_size);
15019 if (data == NULL) {
15020 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015021 goto onError;
15022 }
15023
Victor Stinnerc3c74152011-10-02 20:39:55 +020015024 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015025 if (share_utf8) {
15026 _PyUnicode_UTF8_LENGTH(self) = length;
15027 _PyUnicode_UTF8(self) = data;
15028 }
15029 if (share_wstr) {
15030 _PyUnicode_WSTR_LENGTH(self) = length;
15031 _PyUnicode_WSTR(self) = (wchar_t *)data;
15032 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015033
Christian Heimesf051e432016-09-13 20:22:02 +020015034 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015035 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015036 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015037#ifdef Py_DEBUG
15038 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15039#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015040 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015041 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015042
15043onError:
15044 Py_DECREF(unicode);
15045 Py_DECREF(self);
15046 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015047}
15048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015049PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015050"str(object='') -> str\n\
15051str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015052\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015053Create a new string object from the given object. If encoding or\n\
15054errors is specified, then the object must expose a data buffer\n\
15055that will be decoded using the given encoding and error handler.\n\
15056Otherwise, returns the result of object.__str__() (if defined)\n\
15057or repr(object).\n\
15058encoding defaults to sys.getdefaultencoding().\n\
15059errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015060
/* Forward declaration: unicode_iter is referenced by the tp_iter slot
   below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str. The initializer is positional: every entry must
   stay in the order of the PyTypeObject fields named in the comments. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash */
    0,                            /* tp_call */
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
15106
15107/* Initialize the Unicode implementation */
15108
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015109_PyInitError
15110_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015111{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015112 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015113 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015114 0x000A, /* LINE FEED */
15115 0x000D, /* CARRIAGE RETURN */
15116 0x001C, /* FILE SEPARATOR */
15117 0x001D, /* GROUP SEPARATOR */
15118 0x001E, /* RECORD SEPARATOR */
15119 0x0085, /* NEXT LINE */
15120 0x2028, /* LINE SEPARATOR */
15121 0x2029, /* PARAGRAPH SEPARATOR */
15122 };
15123
Fred Drakee4315f52000-05-09 19:53:39 +000015124 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015125 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015126 if (!unicode_empty) {
15127 return _Py_INIT_ERR("Can't create empty string");
15128 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015129 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015130
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015131 if (PyType_Ready(&PyUnicode_Type) < 0) {
15132 return _Py_INIT_ERR("Can't initialize unicode type");
15133 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015134
15135 /* initialize the linebreak bloom filter */
15136 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015137 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015138 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015139
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015140 if (PyType_Ready(&EncodingMapType) < 0) {
15141 return _Py_INIT_ERR("Can't initialize encoding map type");
15142 }
15143 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15144 return _Py_INIT_ERR("Can't initialize field name iterator type");
15145 }
15146 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15147 return _Py_INIT_ERR("Can't initialize formatter iter type");
15148 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015149 return _Py_INIT_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015150}
15151
15152/* Finalize the Unicode implementation */
15153
/* Clear the unicode free list.

   This implementation does no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15159
Guido van Rossumd57fd912000-03-10 22:53:23 +000015160void
Thomas Wouters78890102000-07-22 19:25:51 +000015161_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015162{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015163 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015164
Serhiy Storchaka05997252013-01-26 12:14:02 +020015165 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015166
Serhiy Storchaka05997252013-01-26 12:14:02 +020015167 for (i = 0; i < 256; i++)
15168 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015169 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015170 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015171}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015172
Walter Dörwald16807132007-05-25 13:52:07 +000015173void
15174PyUnicode_InternInPlace(PyObject **p)
15175{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015176 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015177 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015178#ifdef Py_DEBUG
15179 assert(s != NULL);
15180 assert(_PyUnicode_CHECK(s));
15181#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015182 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015183 return;
15184#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015185 /* If it's a subclass, we don't really know what putting
15186 it in the interned dict might do. */
15187 if (!PyUnicode_CheckExact(s))
15188 return;
15189 if (PyUnicode_CHECK_INTERNED(s))
15190 return;
15191 if (interned == NULL) {
15192 interned = PyDict_New();
15193 if (interned == NULL) {
15194 PyErr_Clear(); /* Don't leave an exception */
15195 return;
15196 }
15197 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015198 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015199 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015200 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015201 if (t == NULL) {
15202 PyErr_Clear();
15203 return;
15204 }
15205 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015206 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015207 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015208 return;
15209 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015210 /* The two references in interned are not counted by refcnt.
15211 The deallocator will take care of this */
15212 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015213 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015214}
15215
15216void
15217PyUnicode_InternImmortal(PyObject **p)
15218{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015219 PyUnicode_InternInPlace(p);
15220 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015221 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015222 Py_INCREF(*p);
15223 }
Walter Dörwald16807132007-05-25 13:52:07 +000015224}
15225
15226PyObject *
15227PyUnicode_InternFromString(const char *cp)
15228{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015229 PyObject *s = PyUnicode_FromString(cp);
15230 if (s == NULL)
15231 return NULL;
15232 PyUnicode_InternInPlace(&s);
15233 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015234}
15235
Alexander Belopolsky40018472011-02-26 01:02:56 +000015236void
15237_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015238{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015239 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015240 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015241 Py_ssize_t i, n;
15242 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015243
Benjamin Peterson14339b62009-01-31 16:36:08 +000015244 if (interned == NULL || !PyDict_Check(interned))
15245 return;
15246 keys = PyDict_Keys(interned);
15247 if (keys == NULL || !PyList_Check(keys)) {
15248 PyErr_Clear();
15249 return;
15250 }
Walter Dörwald16807132007-05-25 13:52:07 +000015251
Benjamin Peterson14339b62009-01-31 16:36:08 +000015252 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15253 detector, interned unicode strings are not forcibly deallocated;
15254 rather, we give them their stolen references back, and then clear
15255 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015256
Benjamin Peterson14339b62009-01-31 16:36:08 +000015257 n = PyList_GET_SIZE(keys);
15258 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015259 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015260 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015261 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015262 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015263 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015265 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015266 case SSTATE_NOT_INTERNED:
15267 /* XXX Shouldn't happen */
15268 break;
15269 case SSTATE_INTERNED_IMMORTAL:
15270 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015271 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015272 break;
15273 case SSTATE_INTERNED_MORTAL:
15274 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015275 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015276 break;
15277 default:
15278 Py_FatalError("Inconsistent interned string state.");
15279 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015280 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015281 }
15282 fprintf(stderr, "total size of all interned strings: "
15283 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15284 "mortal/immortal\n", mortal_size, immortal_size);
15285 Py_DECREF(keys);
15286 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015287 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015288}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015289
15290
15291/********************* Unicode Iterator **************************/
15292
15293typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015294 PyObject_HEAD
15295 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015296 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015297} unicodeiterobject;
15298
15299static void
15300unicodeiter_dealloc(unicodeiterobject *it)
15301{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015302 _PyObject_GC_UNTRACK(it);
15303 Py_XDECREF(it->it_seq);
15304 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015305}
15306
15307static int
15308unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15309{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015310 Py_VISIT(it->it_seq);
15311 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015312}
15313
15314static PyObject *
15315unicodeiter_next(unicodeiterobject *it)
15316{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015317 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015318
Benjamin Peterson14339b62009-01-31 16:36:08 +000015319 assert(it != NULL);
15320 seq = it->it_seq;
15321 if (seq == NULL)
15322 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015323 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015325 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15326 int kind = PyUnicode_KIND(seq);
15327 void *data = PyUnicode_DATA(seq);
15328 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15329 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015330 if (item != NULL)
15331 ++it->it_index;
15332 return item;
15333 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015334
Benjamin Peterson14339b62009-01-31 16:36:08 +000015335 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015336 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015337 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015338}
15339
15340static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015341unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015342{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015343 Py_ssize_t len = 0;
15344 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015345 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015346 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015347}
15348
15349PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15350
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015351static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015352unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015353{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015354 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015355 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015356 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015357 it->it_seq, it->it_index);
15358 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015359 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015360 if (u == NULL)
15361 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015362 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015363 }
15364}
15365
15366PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15367
15368static PyObject *
15369unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15370{
15371 Py_ssize_t index = PyLong_AsSsize_t(state);
15372 if (index == -1 && PyErr_Occurred())
15373 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015374 if (it->it_seq != NULL) {
15375 if (index < 0)
15376 index = 0;
15377 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15378 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15379 it->it_index = index;
15380 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015381 Py_RETURN_NONE;
15382}
15383
15384PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15385
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015386static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015387 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015388 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015389 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15390 reduce_doc},
15391 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15392 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015393 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015394};
15395
15396PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015397 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15398 "str_iterator", /* tp_name */
15399 sizeof(unicodeiterobject), /* tp_basicsize */
15400 0, /* tp_itemsize */
15401 /* methods */
15402 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15403 0, /* tp_print */
15404 0, /* tp_getattr */
15405 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015406 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015407 0, /* tp_repr */
15408 0, /* tp_as_number */
15409 0, /* tp_as_sequence */
15410 0, /* tp_as_mapping */
15411 0, /* tp_hash */
15412 0, /* tp_call */
15413 0, /* tp_str */
15414 PyObject_GenericGetAttr, /* tp_getattro */
15415 0, /* tp_setattro */
15416 0, /* tp_as_buffer */
15417 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15418 0, /* tp_doc */
15419 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15420 0, /* tp_clear */
15421 0, /* tp_richcompare */
15422 0, /* tp_weaklistoffset */
15423 PyObject_SelfIter, /* tp_iter */
15424 (iternextfunc)unicodeiter_next, /* tp_iternext */
15425 unicodeiter_methods, /* tp_methods */
15426 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015427};
15428
15429static PyObject *
15430unicode_iter(PyObject *seq)
15431{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015432 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015433
Benjamin Peterson14339b62009-01-31 16:36:08 +000015434 if (!PyUnicode_Check(seq)) {
15435 PyErr_BadInternalCall();
15436 return NULL;
15437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015438 if (PyUnicode_READY(seq) == -1)
15439 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015440 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15441 if (it == NULL)
15442 return NULL;
15443 it->it_index = 0;
15444 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015445 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015446 _PyObject_GC_TRACK(it);
15447 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015448}
15449
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015450
15451size_t
15452Py_UNICODE_strlen(const Py_UNICODE *u)
15453{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015454 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015455}
15456
15457Py_UNICODE*
15458Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15459{
15460 Py_UNICODE *u = s1;
15461 while ((*u++ = *s2++));
15462 return s1;
15463}
15464
15465Py_UNICODE*
15466Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15467{
15468 Py_UNICODE *u = s1;
15469 while ((*u++ = *s2++))
15470 if (n-- == 0)
15471 break;
15472 return s1;
15473}
15474
15475Py_UNICODE*
15476Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15477{
15478 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015479 u1 += wcslen(u1);
15480 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015481 return s1;
15482}
15483
15484int
15485Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15486{
15487 while (*s1 && *s2 && *s1 == *s2)
15488 s1++, s2++;
15489 if (*s1 && *s2)
15490 return (*s1 < *s2) ? -1 : +1;
15491 if (*s1)
15492 return 1;
15493 if (*s2)
15494 return -1;
15495 return 0;
15496}
15497
15498int
15499Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15500{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015501 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015502 for (; n != 0; n--) {
15503 u1 = *s1;
15504 u2 = *s2;
15505 if (u1 != u2)
15506 return (u1 < u2) ? -1 : +1;
15507 if (u1 == '\0')
15508 return 0;
15509 s1++;
15510 s2++;
15511 }
15512 return 0;
15513}
15514
15515Py_UNICODE*
15516Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15517{
15518 const Py_UNICODE *p;
15519 for (p = s; *p; p++)
15520 if (*p == c)
15521 return (Py_UNICODE*)p;
15522 return NULL;
15523}
15524
15525Py_UNICODE*
15526Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15527{
15528 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015529 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015530 while (p != s) {
15531 p--;
15532 if (*p == c)
15533 return (Py_UNICODE*)p;
15534 }
15535 return NULL;
15536}
Victor Stinner331ea922010-08-10 16:37:20 +000015537
Victor Stinner71133ff2010-09-01 23:43:53 +000015538Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015539PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015540{
Victor Stinner577db2c2011-10-11 22:12:48 +020015541 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015542 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015544 if (!PyUnicode_Check(unicode)) {
15545 PyErr_BadArgument();
15546 return NULL;
15547 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015548 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015549 if (u == NULL)
15550 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015551 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015552 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015553 PyErr_NoMemory();
15554 return NULL;
15555 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015556 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015557 size *= sizeof(Py_UNICODE);
15558 copy = PyMem_Malloc(size);
15559 if (copy == NULL) {
15560 PyErr_NoMemory();
15561 return NULL;
15562 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015563 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015564 return copy;
15565}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015566
Georg Brandl66c221e2010-10-14 07:04:07 +000015567/* A _string module, to export formatter_parser and formatter_field_name_split
15568 to the string.Formatter class implemented in Python. */
15569
15570static PyMethodDef _string_methods[] = {
15571 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15572 METH_O, PyDoc_STR("split the argument as a field name")},
15573 {"formatter_parser", (PyCFunction) formatter_parser,
15574 METH_O, PyDoc_STR("parse the argument as a format string")},
15575 {NULL, NULL}
15576};
15577
15578static struct PyModuleDef _string_module = {
15579 PyModuleDef_HEAD_INIT,
15580 "_string",
15581 PyDoc_STR("string helper module"),
15582 0,
15583 _string_methods,
15584 NULL,
15585 NULL,
15586 NULL,
15587 NULL
15588};
15589
15590PyMODINIT_FUNC
15591PyInit__string(void)
15592{
15593 return PyModule_Create(&_string_module);
15594}
15595
15596
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015597#ifdef __cplusplus
15598}
15599#endif