blob: 01049f54e89826f0c8b17ea545e1c237708b32e4 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010043#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010044#include "pycore_object.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010045#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050047#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070048#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
Larry Hastings61272b72014-01-07 12:41:53 -080054/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090055class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080056[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090057/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
58
59/*[python input]
60class Py_UCS4_converter(CConverter):
61 type = 'Py_UCS4'
62 converter = 'convert_uc'
63
64 def converter_init(self):
65 if self.default is not unspecified:
66 self.c_default = ascii(self.default)
67 if len(self.c_default) > 4 or self.c_default[0] != "'":
68 self.c_default = hex(ord(self.default))
69
70[python start generated code]*/
71/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080072
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000073/* --- Globals ------------------------------------------------------------
74
Serhiy Storchaka05997252013-01-26 12:14:02 +020075NOTE: In the interpreter's initialization phase, some globals are currently
76 initialized dynamically as needed. In the process Unicode objects may
77 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000078
79*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000080
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000081
82#ifdef __cplusplus
83extern "C" {
84#endif
85
/* Highest valid Unicode code point: U+10FFFF (1,114,111 in decimal). */
#define MAX_UNICODE 0x10ffff

/* Object check used by the accessor macros below.  Debug builds run the
   structural consistency checker (with check_content == 0); other builds
   only verify that the object is a str instance. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020094
/* --- Field accessors ----------------------------------------------------

   The macros below cast `op` to the relevant object struct and access one
   field.  PyUnicode_UTF8(), PyUnicode_UTF8_LENGTH(), _PyUnicode_KIND() and
   _PyUnicode_GET_LENGTH() additionally assert on `op`; the others perform
   no checking at all. */

/* Cached UTF-8 buffer pointer, read straight from the compact struct. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string.  For a compact ASCII string this is the
   memory located right after the PyASCIIObject header; otherwise it is the
   stored utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Stored length of the cached UTF-8 buffer. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string.  For a compact ASCII string this is the
   string length itself; otherwise it is the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation and its length. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Unchecked access to the length, state and hash fields. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Asserting access to the kind and the length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Data pointer stored in a (non-compact) PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200129
/* Redefine PyUnicode_READY() for this file so that it asserts through
   _PyUnicode_CHECK().  Evaluates to 0 when the string is already ready,
   otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200136
/* True if the UTF-8 pointer of `op` is the same address as its character
   data, i.e. no separate UTF-8 buffer is in use.  Must not be used on
   compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the same address as its character
   data.  The macro argument is used on both sides of the comparison; it
   must not depend on a variable named `unicode` existing at the call
   site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
144
/* True if the Unicode object owns a separate UTF-8 memory block, i.e. the
   string is not compact ASCII, the utf8 pointer is set, and it does not
   alias the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separate wstr memory block, i.e. the
   wstr pointer is set and either the string is not ready yet or wstr does
   not alias the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
158
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters (of type "from_type *" or
   "const from_type *"); to is a pointer of type "to_type *" pointing to
   the buffer that receives the converted characters.

   The source pointers are cast to "const from_type *" so that the const
   qualifier of the caller's buffer is never discarded.  All temporaries
   carry a leading underscore to avoid shadowing caller variables.  The
   main loop handles four characters per iteration; the trailing loop
   copies the remaining 0-3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end)) {                        \
            *_to++ = (to_type) *_iter++;                \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200182
/* Over-allocation factor: 2 corresponds to 50% of over-allocation and
   4 to 25%. */
#ifndef MS_WINDOWS
   /* On Linux, over-allocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, over-allocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
190
Walter Dörwald16807132007-05-25 13:52:07 +0000191/* This dictionary holds all interned unicode strings. Note that references
192 to strings in this dictionary are *not* counted in the string's ob_refcnt.
193 When the interned string reaches a refcnt of 0 the string deallocation
194 function will delete the reference from this dictionary.
195
196 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000197 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000198*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200199static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200203
Serhiy Storchaka678db842013-01-26 12:16:36 +0200204#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200205 do { \
206 if (unicode_empty != NULL) \
207 Py_INCREF(unicode_empty); \
208 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200209 unicode_empty = PyUnicode_New(0, 0); \
210 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200211 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200212 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
213 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200214 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200215 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
Serhiy Storchaka678db842013-01-26 12:16:36 +0200217#define _Py_RETURN_UNICODE_EMPTY() \
218 do { \
219 _Py_INCREF_UNICODE_EMPTY(); \
220 return unicode_empty; \
221 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000222
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200223/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700224static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200225_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
226
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200227/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200228static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200229
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230/* Single character Unicode strings in the Latin-1 range are being
231 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200232static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000233
/* Fast detection of the most frequent whitespace characters: entry `c` is
   1 when the ASCII code `c` is a whitespace character, 0 otherwise.  The
   table has one entry per code in the range 0x00..0x7F. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00..0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08..0x0F:
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10..0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18..0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20..0x27:
     *   0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28..0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40..0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
264
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200265/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200266static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200267static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100268static int unicode_modifiable(PyObject *unicode);
269
Victor Stinnerfe226c02011-10-03 03:52:20 +0200270
Alexander Belopolsky40018472011-02-26 01:02:56 +0000271static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100272_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200273static PyObject *
274_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
275static PyObject *
276_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
277
278static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000279unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000280 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100281 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000282 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
283
Alexander Belopolsky40018472011-02-26 01:02:56 +0000284static void
285raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300286 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100287 PyObject *unicode,
288 Py_ssize_t startpos, Py_ssize_t endpos,
289 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000290
/* Fast detection of line break characters: entry `c` is 1 when the ASCII
   code `c` is a line break, 0 otherwise.  The table has one entry per code
   in the range 0x00..0x7F. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00..0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08..0x0F:
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10..0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18..0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20..0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40..0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
318
INADA Naoki3ae20562017-01-16 20:41:20 +0900319static int convert_uc(PyObject *obj, void *addr);
320
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300321#include "clinic/unicodeobject.c.h"
322
Victor Stinner3d4226a2018-08-29 22:21:32 +0200323_Py_error_handler
324_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200325{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200326 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200327 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200328 }
329 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200330 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200331 }
332 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200333 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200334 }
335 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200336 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200337 }
338 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200339 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200340 }
341 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200342 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200343 }
344 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200345 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200346 }
Victor Stinner50149202015-09-22 00:26:54 +0200347 return _Py_ERROR_OTHER;
348}
349
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300350/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
351 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000352Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000353PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000354{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000355#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000357#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000358 /* This is actually an illegal character, so it should
359 not be passed to unichr. */
360 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000361#endif
362}
363
#ifdef Py_DEBUG
/* Debug-build validator for the internal invariants of a str object.

   op            -- object to check; it must be a str instance.
   check_content -- if non-zero (and the string is not in the legacy
                    wchar_t-only form), also scan the characters to verify
                    that the narrowest possible kind is used and that the
                    buffer is null-terminated.

   Every invariant is checked through _PyObject_ASSERT().  Returns 1 when
   control reaches the end, so the call can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
#define ASSERT(expr) _PyObject_ASSERT(op, (expr))

    /* Kind whose character data has the same layout as wchar_t. */
#if SIZEOF_WCHAR_T == 2
    const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
    const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif
    PyASCIIObject *ascii;
    unsigned int kind;

    ASSERT(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* Compact ASCII string */
        ASSERT(kind == PyUnicode_1BYTE_KIND);
        ASSERT(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact non-ASCII string: the characters start right after
               the PyCompactUnicodeObject header. */
            data = compact + 1;
            ASSERT(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            ASSERT(ascii->state.ascii == 0);
            ASSERT(ascii->state.ready == 1);
            ASSERT(compact->utf8 != data);
        }
        else {
            /* Non-compact string: the characters live in data.any. */
            data = ((PyUnicodeObject *)op)->data.any;

            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr representation exists. */
                ASSERT(ascii->length == 0);
                ASSERT(ascii->hash == -1);
                ASSERT(ascii->state.compact == 0);
                ASSERT(ascii->state.ascii == 0);
                ASSERT(ascii->state.ready == 0);
                ASSERT(ascii->state.interned == SSTATE_NOT_INTERNED);
                ASSERT(ascii->wstr != NULL);
                ASSERT(data == NULL);
                ASSERT(compact->utf8 == NULL);
            }
            else {
                ASSERT(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                ASSERT(ascii->state.compact == 0);
                ASSERT(ascii->state.ready == 1);
                ASSERT(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation. */
                    ASSERT(compact->utf8 == data);
                    ASSERT(compact->utf8_length == ascii->length);
                }
                else {
                    ASSERT(compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                /* wstr shares the character data. */
                ASSERT(ascii->wstr == data);
                ASSERT(compact->wstr_length == ascii->length);
            }
            else {
                ASSERT(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            ASSERT(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            ASSERT(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (ch > maxchar) {
                maxchar = ch;
            }
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                ASSERT(maxchar >= 128);
                ASSERT(maxchar <= 255);
            }
            else {
                ASSERT(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            ASSERT(maxchar >= 0x100);
            ASSERT(maxchar <= 0xFFFF);
            break;
        default:
            ASSERT(maxchar >= 0x10000);
            ASSERT(maxchar <= MAX_UNICODE);
            break;
        }

        /* The character following the last one must be a null character. */
        ASSERT(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef ASSERT
}
#endif
483
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100484static PyObject*
485unicode_result_wchar(PyObject *unicode)
486{
487#ifndef Py_DEBUG
488 Py_ssize_t len;
489
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100490 len = _PyUnicode_WSTR_LENGTH(unicode);
491 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100492 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200493 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100494 }
495
496 if (len == 1) {
497 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100498 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100499 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
500 Py_DECREF(unicode);
501 return latin1_char;
502 }
503 }
504
505 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200506 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100507 return NULL;
508 }
509#else
Victor Stinneraa771272012-10-04 02:32:58 +0200510 assert(Py_REFCNT(unicode) == 1);
511
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100512 /* don't make the result ready in debug mode to ensure that the caller
513 makes the string ready before using it */
514 assert(_PyUnicode_CheckConsistency(unicode, 1));
515#endif
516 return unicode;
517}
518
519static PyObject*
520unicode_result_ready(PyObject *unicode)
521{
522 Py_ssize_t length;
523
524 length = PyUnicode_GET_LENGTH(unicode);
525 if (length == 0) {
526 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100527 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200528 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100529 }
530 return unicode_empty;
531 }
532
533 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200534 void *data = PyUnicode_DATA(unicode);
535 int kind = PyUnicode_KIND(unicode);
536 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100537 if (ch < 256) {
538 PyObject *latin1_char = unicode_latin1[ch];
539 if (latin1_char != NULL) {
540 if (unicode != latin1_char) {
541 Py_INCREF(latin1_char);
542 Py_DECREF(unicode);
543 }
544 return latin1_char;
545 }
546 else {
547 assert(_PyUnicode_CheckConsistency(unicode, 1));
548 Py_INCREF(unicode);
549 unicode_latin1[ch] = unicode;
550 return unicode;
551 }
552 }
553 }
554
555 assert(_PyUnicode_CheckConsistency(unicode, 1));
556 return unicode;
557}
558
559static PyObject*
560unicode_result(PyObject *unicode)
561{
562 assert(_PyUnicode_CHECK(unicode));
563 if (PyUnicode_IS_READY(unicode))
564 return unicode_result_ready(unicode);
565 else
566 return unicode_result_wchar(unicode);
567}
568
Victor Stinnerc4b49542011-12-11 22:44:26 +0100569static PyObject*
570unicode_result_unchanged(PyObject *unicode)
571{
572 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500573 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100574 return NULL;
575 Py_INCREF(unicode);
576 return unicode;
577 }
578 else
579 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100580 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100581}
582
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200583/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
584 ASCII, Latin1, UTF-8, etc. */
585static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200586backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200587 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
588{
Victor Stinnerad771582015-10-09 12:38:53 +0200589 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200590 Py_UCS4 ch;
591 enum PyUnicode_Kind kind;
592 void *data;
593
594 assert(PyUnicode_IS_READY(unicode));
595 kind = PyUnicode_KIND(unicode);
596 data = PyUnicode_DATA(unicode);
597
598 size = 0;
599 /* determine replacement size */
600 for (i = collstart; i < collend; ++i) {
601 Py_ssize_t incr;
602
603 ch = PyUnicode_READ(kind, data, i);
604 if (ch < 0x100)
605 incr = 2+2;
606 else if (ch < 0x10000)
607 incr = 2+4;
608 else {
609 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200610 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200611 }
612 if (size > PY_SSIZE_T_MAX - incr) {
613 PyErr_SetString(PyExc_OverflowError,
614 "encoded result is too long for a Python string");
615 return NULL;
616 }
617 size += incr;
618 }
619
Victor Stinnerad771582015-10-09 12:38:53 +0200620 str = _PyBytesWriter_Prepare(writer, str, size);
621 if (str == NULL)
622 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200623
624 /* generate replacement */
625 for (i = collstart; i < collend; ++i) {
626 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200627 *str++ = '\\';
628 if (ch >= 0x00010000) {
629 *str++ = 'U';
630 *str++ = Py_hexdigits[(ch>>28)&0xf];
631 *str++ = Py_hexdigits[(ch>>24)&0xf];
632 *str++ = Py_hexdigits[(ch>>20)&0xf];
633 *str++ = Py_hexdigits[(ch>>16)&0xf];
634 *str++ = Py_hexdigits[(ch>>12)&0xf];
635 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200636 }
Victor Stinner797485e2015-10-09 03:17:30 +0200637 else if (ch >= 0x100) {
638 *str++ = 'u';
639 *str++ = Py_hexdigits[(ch>>12)&0xf];
640 *str++ = Py_hexdigits[(ch>>8)&0xf];
641 }
642 else
643 *str++ = 'x';
644 *str++ = Py_hexdigits[(ch>>4)&0xf];
645 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 }
647 return str;
648}
649
650/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
651 ASCII, Latin1, UTF-8, etc. */
652static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200653xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200654 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
655{
Victor Stinnerad771582015-10-09 12:38:53 +0200656 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200657 Py_UCS4 ch;
658 enum PyUnicode_Kind kind;
659 void *data;
660
661 assert(PyUnicode_IS_READY(unicode));
662 kind = PyUnicode_KIND(unicode);
663 data = PyUnicode_DATA(unicode);
664
665 size = 0;
666 /* determine replacement size */
667 for (i = collstart; i < collend; ++i) {
668 Py_ssize_t incr;
669
670 ch = PyUnicode_READ(kind, data, i);
671 if (ch < 10)
672 incr = 2+1+1;
673 else if (ch < 100)
674 incr = 2+2+1;
675 else if (ch < 1000)
676 incr = 2+3+1;
677 else if (ch < 10000)
678 incr = 2+4+1;
679 else if (ch < 100000)
680 incr = 2+5+1;
681 else if (ch < 1000000)
682 incr = 2+6+1;
683 else {
684 assert(ch <= MAX_UNICODE);
685 incr = 2+7+1;
686 }
687 if (size > PY_SSIZE_T_MAX - incr) {
688 PyErr_SetString(PyExc_OverflowError,
689 "encoded result is too long for a Python string");
690 return NULL;
691 }
692 size += incr;
693 }
694
Victor Stinnerad771582015-10-09 12:38:53 +0200695 str = _PyBytesWriter_Prepare(writer, str, size);
696 if (str == NULL)
697 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200698
699 /* generate replacement */
700 for (i = collstart; i < collend; ++i) {
701 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
702 }
703 return str;
704}
705
Thomas Wouters477c8d52006-05-27 19:21:47 +0000706/* --- Bloom Filters ----------------------------------------------------- */
707
708/* stuff to implement simple "bloom filters" for Unicode characters.
709 to keep things simple, we use a single bitmask, using the least 5
710 bits from each unicode characters as the bit index. */
711
712/* the linebreak mask is set up by Unicode_Init below */
713
Antoine Pitrouf068f942010-01-13 14:19:12 +0000714#if LONG_BIT >= 128
715#define BLOOM_WIDTH 128
716#elif LONG_BIT >= 64
717#define BLOOM_WIDTH 64
718#elif LONG_BIT >= 32
719#define BLOOM_WIDTH 32
720#else
721#error "LONG_BIT is smaller than 32"
722#endif
723
Thomas Wouters477c8d52006-05-27 19:21:47 +0000724#define BLOOM_MASK unsigned long
725
Serhiy Storchaka05997252013-01-26 12:14:02 +0200726static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000727
Antoine Pitrouf068f942010-01-13 14:19:12 +0000728#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000729
Benjamin Peterson29060642009-01-31 22:14:21 +0000730#define BLOOM_LINEBREAK(ch) \
731 ((ch) < 128U ? ascii_linebreak[(ch)] : \
732 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000733
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700734static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200735make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000736{
Victor Stinnera85af502013-04-09 21:53:54 +0200737#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
738 do { \
739 TYPE *data = (TYPE *)PTR; \
740 TYPE *end = data + LEN; \
741 Py_UCS4 ch; \
742 for (; data != end; data++) { \
743 ch = *data; \
744 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
745 } \
746 break; \
747 } while (0)
748
Thomas Wouters477c8d52006-05-27 19:21:47 +0000749 /* calculate simple bloom-style bitmask for a given unicode string */
750
Antoine Pitrouf068f942010-01-13 14:19:12 +0000751 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000752
753 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200754 switch (kind) {
755 case PyUnicode_1BYTE_KIND:
756 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
757 break;
758 case PyUnicode_2BYTE_KIND:
759 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
760 break;
761 case PyUnicode_4BYTE_KIND:
762 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
763 break;
764 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700765 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200766 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000767 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200768
769#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000770}
771
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300772static int
773ensure_unicode(PyObject *obj)
774{
775 if (!PyUnicode_Check(obj)) {
776 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200777 "must be str, not %.100s",
778 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300779 return -1;
780 }
781 return PyUnicode_READY(obj);
782}
783
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200784/* Compilation of templated routines */
785
786#include "stringlib/asciilib.h"
787#include "stringlib/fastsearch.h"
788#include "stringlib/partition.h"
789#include "stringlib/split.h"
790#include "stringlib/count.h"
791#include "stringlib/find.h"
792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs1lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs2lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
818#include "stringlib/ucs4lib.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/partition.h"
821#include "stringlib/split.h"
822#include "stringlib/count.h"
823#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300824#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200825#include "stringlib/find_max_char.h"
826#include "stringlib/localeutil.h"
827#include "stringlib/undef.h"
828
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200829#include "stringlib/unicodedefs.h"
830#include "stringlib/fastsearch.h"
831#include "stringlib/count.h"
832#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100833#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834
Guido van Rossumd57fd912000-03-10 22:53:23 +0000835/* --- Unicode Object ----------------------------------------------------- */
836
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700837static inline Py_ssize_t
838findchar(const void *s, int kind,
839 Py_ssize_t size, Py_UCS4 ch,
840 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200841{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200842 switch (kind) {
843 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200844 if ((Py_UCS1) ch != ch)
845 return -1;
846 if (direction > 0)
847 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
848 else
849 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200850 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200851 if ((Py_UCS2) ch != ch)
852 return -1;
853 if (direction > 0)
854 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
855 else
856 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200857 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200858 if (direction > 0)
859 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
860 else
861 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200862 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700863 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200864 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200865}
866
Victor Stinnerafffce42012-10-03 23:03:17 +0200867#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000868/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200869 earlier.
870
871 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
872 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
873 invalid character in Unicode 6.0. */
874static void
875unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
876{
877 int kind = PyUnicode_KIND(unicode);
878 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
879 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
880 if (length <= old_length)
881 return;
882 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
883}
884#endif
885
Victor Stinnerfe226c02011-10-03 03:52:20 +0200886static PyObject*
887resize_compact(PyObject *unicode, Py_ssize_t length)
888{
889 Py_ssize_t char_size;
890 Py_ssize_t struct_size;
891 Py_ssize_t new_size;
892 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100893 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200894#ifdef Py_DEBUG
895 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
896#endif
897
Victor Stinner79891572012-05-03 13:43:07 +0200898 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200899 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100900 assert(PyUnicode_IS_COMPACT(unicode));
901
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200902 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100903 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200904 struct_size = sizeof(PyASCIIObject);
905 else
906 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200907 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200908
Victor Stinnerfe226c02011-10-03 03:52:20 +0200909 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
910 PyErr_NoMemory();
911 return NULL;
912 }
913 new_size = (struct_size + (length + 1) * char_size);
914
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200915 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
916 PyObject_DEL(_PyUnicode_UTF8(unicode));
917 _PyUnicode_UTF8(unicode) = NULL;
918 _PyUnicode_UTF8_LENGTH(unicode) = 0;
919 }
Victor Stinner84def372011-12-11 20:04:56 +0100920 _Py_DEC_REFTOTAL;
921 _Py_ForgetReference(unicode);
922
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300923 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100924 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100925 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200926 PyErr_NoMemory();
927 return NULL;
928 }
Victor Stinner84def372011-12-11 20:04:56 +0100929 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200930 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100931
Victor Stinnerfe226c02011-10-03 03:52:20 +0200932 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200933 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200934 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100935 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200936 _PyUnicode_WSTR_LENGTH(unicode) = length;
937 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100938 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
939 PyObject_DEL(_PyUnicode_WSTR(unicode));
940 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100941 if (!PyUnicode_IS_ASCII(unicode))
942 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100943 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200944#ifdef Py_DEBUG
945 unicode_fill_invalid(unicode, old_length);
946#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200947 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
948 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200949 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200950 return unicode;
951}
952
Alexander Belopolsky40018472011-02-26 01:02:56 +0000953static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200954resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000955{
Victor Stinner95663112011-10-04 01:03:50 +0200956 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100957 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200959 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000960
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961 if (PyUnicode_IS_READY(unicode)) {
962 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200963 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200964 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200965#ifdef Py_DEBUG
966 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
967#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968
969 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200970 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200971 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
972 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200973
974 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
975 PyErr_NoMemory();
976 return -1;
977 }
978 new_size = (length + 1) * char_size;
979
Victor Stinner7a9105a2011-12-12 00:13:42 +0100980 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
981 {
982 PyObject_DEL(_PyUnicode_UTF8(unicode));
983 _PyUnicode_UTF8(unicode) = NULL;
984 _PyUnicode_UTF8_LENGTH(unicode) = 0;
985 }
986
Victor Stinnerfe226c02011-10-03 03:52:20 +0200987 data = (PyObject *)PyObject_REALLOC(data, new_size);
988 if (data == NULL) {
989 PyErr_NoMemory();
990 return -1;
991 }
992 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200993 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200995 _PyUnicode_WSTR_LENGTH(unicode) = length;
996 }
997 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200998 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200999 _PyUnicode_UTF8_LENGTH(unicode) = length;
1000 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 _PyUnicode_LENGTH(unicode) = length;
1002 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001003#ifdef Py_DEBUG
1004 unicode_fill_invalid(unicode, old_length);
1005#endif
Victor Stinner95663112011-10-04 01:03:50 +02001006 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001007 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001008 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001009 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001010 }
Victor Stinner95663112011-10-04 01:03:50 +02001011 assert(_PyUnicode_WSTR(unicode) != NULL);
1012
1013 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001014 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001015 PyErr_NoMemory();
1016 return -1;
1017 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001018 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001019 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001020 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001021 if (!wstr) {
1022 PyErr_NoMemory();
1023 return -1;
1024 }
1025 _PyUnicode_WSTR(unicode) = wstr;
1026 _PyUnicode_WSTR(unicode)[length] = 0;
1027 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001028 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001029 return 0;
1030}
1031
Victor Stinnerfe226c02011-10-03 03:52:20 +02001032static PyObject*
1033resize_copy(PyObject *unicode, Py_ssize_t length)
1034{
1035 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001036 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001037 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001038
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001039 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040
1041 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1042 if (copy == NULL)
1043 return NULL;
1044
1045 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001046 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001048 }
1049 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001050 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001051
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001052 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 if (w == NULL)
1054 return NULL;
1055 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1056 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001057 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001058 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001059 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001060 }
1061}
1062
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001064 Ux0000 terminated; some code (e.g. new_identifier)
1065 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001066
1067 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001068 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001069
1070*/
1071
Alexander Belopolsky40018472011-02-26 01:02:56 +00001072static PyUnicodeObject *
1073_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001075 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001076 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001077
Thomas Wouters477c8d52006-05-27 19:21:47 +00001078 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 if (length == 0 && unicode_empty != NULL) {
1080 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001081 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001082 }
1083
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001084 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001085 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001086 return (PyUnicodeObject *)PyErr_NoMemory();
1087 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 if (length < 0) {
1089 PyErr_SetString(PyExc_SystemError,
1090 "Negative size passed to _PyUnicode_New");
1091 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 }
1093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001094 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1095 if (unicode == NULL)
1096 return NULL;
1097 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001098
1099 _PyUnicode_WSTR_LENGTH(unicode) = length;
1100 _PyUnicode_HASH(unicode) = -1;
1101 _PyUnicode_STATE(unicode).interned = 0;
1102 _PyUnicode_STATE(unicode).kind = 0;
1103 _PyUnicode_STATE(unicode).compact = 0;
1104 _PyUnicode_STATE(unicode).ready = 0;
1105 _PyUnicode_STATE(unicode).ascii = 0;
1106 _PyUnicode_DATA_ANY(unicode) = NULL;
1107 _PyUnicode_LENGTH(unicode) = 0;
1108 _PyUnicode_UTF8(unicode) = NULL;
1109 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1112 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001113 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001114 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001115 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117
Jeremy Hyltond8082792003-09-16 19:41:39 +00001118 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001119 * the caller fails before initializing str -- unicode_resize()
1120 * reads str[0], and the Keep-Alive optimization can keep memory
1121 * allocated for str alive across a call to unicode_dealloc(unicode).
1122 * We don't want unicode_resize to read uninitialized memory in
1123 * that case.
1124 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 _PyUnicode_WSTR(unicode)[0] = 0;
1126 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001127
Victor Stinner7931d9a2011-11-04 00:22:48 +01001128 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001129 return unicode;
1130}
1131
Victor Stinnerf42dc442011-10-02 23:33:16 +02001132static const char*
1133unicode_kind_name(PyObject *unicode)
1134{
Victor Stinner42dfd712011-10-03 14:41:45 +02001135 /* don't check consistency: unicode_kind_name() is called from
1136 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001137 if (!PyUnicode_IS_COMPACT(unicode))
1138 {
1139 if (!PyUnicode_IS_READY(unicode))
1140 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001141 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001142 {
1143 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001144 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001145 return "legacy ascii";
1146 else
1147 return "legacy latin1";
1148 case PyUnicode_2BYTE_KIND:
1149 return "legacy UCS2";
1150 case PyUnicode_4BYTE_KIND:
1151 return "legacy UCS4";
1152 default:
1153 return "<legacy invalid kind>";
1154 }
1155 }
1156 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001157 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001158 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001159 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001160 return "ascii";
1161 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001162 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001163 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001164 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001165 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001166 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167 default:
1168 return "<invalid compact kind>";
1169 }
1170}
1171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001172#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro; the argument is taken as
   void* and cast to PyObject*. */
char *
_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1178
/* Function form of the _PyUnicode_COMPACT_DATA() macro; the argument is
   taken as void* and cast to PyObject*. */
void *
_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
Victor Stinnera42de742018-11-22 10:25:22 +01001183void *_PyUnicode_data(void *unicode_raw) {
1184 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001185 printf("obj %p\n", unicode);
1186 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1187 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1188 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1189 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1190 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1191 return PyUnicode_DATA(unicode);
1192}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001193
1194void
1195_PyUnicode_Dump(PyObject *op)
1196{
1197 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001198 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1199 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1200 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001201
Victor Stinnera849a4b2011-10-03 12:12:11 +02001202 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001203 {
1204 if (ascii->state.ascii)
1205 data = (ascii + 1);
1206 else
1207 data = (compact + 1);
1208 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001209 else
1210 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001211 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1212 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001213
Victor Stinnera849a4b2011-10-03 12:12:11 +02001214 if (ascii->wstr == data)
1215 printf("shared ");
1216 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001217
Victor Stinnera3b334d2011-10-03 13:53:37 +02001218 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001219 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001220 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1221 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001222 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1223 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001224 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001225 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001226}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001227#endif
1228
1229PyObject *
1230PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1231{
1232 PyObject *obj;
1233 PyCompactUnicodeObject *unicode;
1234 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001235 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001236 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001237 Py_ssize_t char_size;
1238 Py_ssize_t struct_size;
1239
1240 /* Optimization for empty strings */
1241 if (size == 0 && unicode_empty != NULL) {
1242 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001243 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001244 }
1245
Victor Stinner9e9d6892011-10-04 01:02:02 +02001246 is_ascii = 0;
1247 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248 struct_size = sizeof(PyCompactUnicodeObject);
1249 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001250 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 char_size = 1;
1252 is_ascii = 1;
1253 struct_size = sizeof(PyASCIIObject);
1254 }
1255 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001256 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257 char_size = 1;
1258 }
1259 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001260 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001261 char_size = 2;
1262 if (sizeof(wchar_t) == 2)
1263 is_sharing = 1;
1264 }
1265 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001266 if (maxchar > MAX_UNICODE) {
1267 PyErr_SetString(PyExc_SystemError,
1268 "invalid maximum character passed to PyUnicode_New");
1269 return NULL;
1270 }
Victor Stinner8f825062012-04-27 13:55:39 +02001271 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272 char_size = 4;
1273 if (sizeof(wchar_t) == 4)
1274 is_sharing = 1;
1275 }
1276
1277 /* Ensure we won't overflow the size. */
1278 if (size < 0) {
1279 PyErr_SetString(PyExc_SystemError,
1280 "Negative size passed to PyUnicode_New");
1281 return NULL;
1282 }
1283 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1284 return PyErr_NoMemory();
1285
1286 /* Duplicated allocation code from _PyObject_New() instead of a call to
1287 * PyObject_New() so we are able to allocate space for the object and
1288 * it's data buffer.
1289 */
1290 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1291 if (obj == NULL)
1292 return PyErr_NoMemory();
1293 obj = PyObject_INIT(obj, &PyUnicode_Type);
1294 if (obj == NULL)
1295 return NULL;
1296
1297 unicode = (PyCompactUnicodeObject *)obj;
1298 if (is_ascii)
1299 data = ((PyASCIIObject*)obj) + 1;
1300 else
1301 data = unicode + 1;
1302 _PyUnicode_LENGTH(unicode) = size;
1303 _PyUnicode_HASH(unicode) = -1;
1304 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001305 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306 _PyUnicode_STATE(unicode).compact = 1;
1307 _PyUnicode_STATE(unicode).ready = 1;
1308 _PyUnicode_STATE(unicode).ascii = is_ascii;
1309 if (is_ascii) {
1310 ((char*)data)[size] = 0;
1311 _PyUnicode_WSTR(unicode) = NULL;
1312 }
Victor Stinner8f825062012-04-27 13:55:39 +02001313 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 ((char*)data)[size] = 0;
1315 _PyUnicode_WSTR(unicode) = NULL;
1316 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001318 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 else {
1321 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001322 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001323 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001325 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 ((Py_UCS4*)data)[size] = 0;
1327 if (is_sharing) {
1328 _PyUnicode_WSTR_LENGTH(unicode) = size;
1329 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1330 }
1331 else {
1332 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1333 _PyUnicode_WSTR(unicode) = NULL;
1334 }
1335 }
Victor Stinner8f825062012-04-27 13:55:39 +02001336#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001337 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001338#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001339 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340 return obj;
1341}
1342
1343#if SIZEOF_WCHAR_T == 2
1344/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1345 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001346 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001347
1348 This function assumes that unicode can hold one more code point than wstr
1349 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001350static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001352 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001353{
1354 const wchar_t *iter;
1355 Py_UCS4 *ucs4_out;
1356
Victor Stinner910337b2011-10-03 03:20:16 +02001357 assert(unicode != NULL);
1358 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001359 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1360 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1361
1362 for (iter = begin; iter < end; ) {
1363 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1364 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001365 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1366 && (iter+1) < end
1367 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 {
Victor Stinner551ac952011-11-29 22:58:13 +01001369 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 iter += 2;
1371 }
1372 else {
1373 *ucs4_out++ = *iter;
1374 iter++;
1375 }
1376 }
1377 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1378 _PyUnicode_GET_LENGTH(unicode)));
1379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380}
1381#endif
1382
Victor Stinnercd9950f2011-10-02 00:34:53 +02001383static int
Victor Stinner488fa492011-12-12 00:01:39 +01001384unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001385{
Victor Stinner488fa492011-12-12 00:01:39 +01001386 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001387 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001388 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001389 return -1;
1390 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001391 return 0;
1392}
1393
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001394static int
1395_copy_characters(PyObject *to, Py_ssize_t to_start,
1396 PyObject *from, Py_ssize_t from_start,
1397 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001399 unsigned int from_kind, to_kind;
1400 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401
Victor Stinneree4544c2012-05-09 22:24:08 +02001402 assert(0 <= how_many);
1403 assert(0 <= from_start);
1404 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001405 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001406 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001407 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408
Victor Stinnerd3f08822012-05-29 12:57:52 +02001409 assert(PyUnicode_Check(to));
1410 assert(PyUnicode_IS_READY(to));
1411 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1412
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001413 if (how_many == 0)
1414 return 0;
1415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001417 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001418 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001419 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420
Victor Stinnerf1852262012-06-16 16:38:26 +02001421#ifdef Py_DEBUG
1422 if (!check_maxchar
1423 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1424 {
1425 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1426 Py_UCS4 ch;
1427 Py_ssize_t i;
1428 for (i=0; i < how_many; i++) {
1429 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1430 assert(ch <= to_maxchar);
1431 }
1432 }
1433#endif
1434
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001435 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001436 if (check_maxchar
1437 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1438 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001439 /* Writing Latin-1 characters into an ASCII string requires to
1440 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001441 Py_UCS4 max_char;
1442 max_char = ucs1lib_find_max_char(from_data,
1443 (Py_UCS1*)from_data + how_many);
1444 if (max_char >= 128)
1445 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001446 }
Christian Heimesf051e432016-09-13 20:22:02 +02001447 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001448 (char*)from_data + from_kind * from_start,
1449 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001450 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001451 else if (from_kind == PyUnicode_1BYTE_KIND
1452 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001453 {
1454 _PyUnicode_CONVERT_BYTES(
1455 Py_UCS1, Py_UCS2,
1456 PyUnicode_1BYTE_DATA(from) + from_start,
1457 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1458 PyUnicode_2BYTE_DATA(to) + to_start
1459 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001460 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001461 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001462 && to_kind == PyUnicode_4BYTE_KIND)
1463 {
1464 _PyUnicode_CONVERT_BYTES(
1465 Py_UCS1, Py_UCS4,
1466 PyUnicode_1BYTE_DATA(from) + from_start,
1467 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1468 PyUnicode_4BYTE_DATA(to) + to_start
1469 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001470 }
1471 else if (from_kind == PyUnicode_2BYTE_KIND
1472 && to_kind == PyUnicode_4BYTE_KIND)
1473 {
1474 _PyUnicode_CONVERT_BYTES(
1475 Py_UCS2, Py_UCS4,
1476 PyUnicode_2BYTE_DATA(from) + from_start,
1477 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1478 PyUnicode_4BYTE_DATA(to) + to_start
1479 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001480 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001481 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001482 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1483
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001484 if (!check_maxchar) {
1485 if (from_kind == PyUnicode_2BYTE_KIND
1486 && to_kind == PyUnicode_1BYTE_KIND)
1487 {
1488 _PyUnicode_CONVERT_BYTES(
1489 Py_UCS2, Py_UCS1,
1490 PyUnicode_2BYTE_DATA(from) + from_start,
1491 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1492 PyUnicode_1BYTE_DATA(to) + to_start
1493 );
1494 }
1495 else if (from_kind == PyUnicode_4BYTE_KIND
1496 && to_kind == PyUnicode_1BYTE_KIND)
1497 {
1498 _PyUnicode_CONVERT_BYTES(
1499 Py_UCS4, Py_UCS1,
1500 PyUnicode_4BYTE_DATA(from) + from_start,
1501 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1502 PyUnicode_1BYTE_DATA(to) + to_start
1503 );
1504 }
1505 else if (from_kind == PyUnicode_4BYTE_KIND
1506 && to_kind == PyUnicode_2BYTE_KIND)
1507 {
1508 _PyUnicode_CONVERT_BYTES(
1509 Py_UCS4, Py_UCS2,
1510 PyUnicode_4BYTE_DATA(from) + from_start,
1511 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1512 PyUnicode_2BYTE_DATA(to) + to_start
1513 );
1514 }
1515 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001516 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001517 }
1518 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001519 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001520 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001521 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001522 Py_ssize_t i;
1523
Victor Stinnera0702ab2011-09-29 14:14:38 +02001524 for (i=0; i < how_many; i++) {
1525 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001526 if (ch > to_maxchar)
1527 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001528 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1529 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001530 }
1531 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001532 return 0;
1533}
1534
Victor Stinnerd3f08822012-05-29 12:57:52 +02001535void
1536_PyUnicode_FastCopyCharacters(
1537 PyObject *to, Py_ssize_t to_start,
1538 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001539{
1540 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1541}
1542
1543Py_ssize_t
1544PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1545 PyObject *from, Py_ssize_t from_start,
1546 Py_ssize_t how_many)
1547{
1548 int err;
1549
1550 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1551 PyErr_BadInternalCall();
1552 return -1;
1553 }
1554
Benjamin Petersonbac79492012-01-14 13:34:47 -05001555 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001556 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001557 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001558 return -1;
1559
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001560 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001561 PyErr_SetString(PyExc_IndexError, "string index out of range");
1562 return -1;
1563 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001564 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001565 PyErr_SetString(PyExc_IndexError, "string index out of range");
1566 return -1;
1567 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001568 if (how_many < 0) {
1569 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1570 return -1;
1571 }
1572 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001573 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1574 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001575 "Cannot write %zi characters at %zi "
1576 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001577 how_many, to_start, PyUnicode_GET_LENGTH(to));
1578 return -1;
1579 }
1580
1581 if (how_many == 0)
1582 return 0;
1583
Victor Stinner488fa492011-12-12 00:01:39 +01001584 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001585 return -1;
1586
1587 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1588 if (err) {
1589 PyErr_Format(PyExc_SystemError,
1590 "Cannot copy %s characters "
1591 "into a string of %s characters",
1592 unicode_kind_name(from),
1593 unicode_kind_name(to));
1594 return -1;
1595 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001596 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597}
1598
Victor Stinner17222162011-09-28 22:15:37 +02001599/* Find the maximum code point and count the number of surrogate pairs so a
1600 correct string length can be computed before converting a string to UCS4.
1601 This function counts single surrogates as a character and not as a pair.
1602
1603 Return 0 on success, or -1 on error. */
1604static int
1605find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1606 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001607{
1608 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001609 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001610
Victor Stinnerc53be962011-10-02 21:33:54 +02001611 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001612 *num_surrogates = 0;
1613 *maxchar = 0;
1614
1615 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001617 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1618 && (iter+1) < end
1619 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1620 {
1621 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1622 ++(*num_surrogates);
1623 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001624 }
1625 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001626#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001627 {
1628 ch = *iter;
1629 iter++;
1630 }
1631 if (ch > *maxchar) {
1632 *maxchar = ch;
1633 if (*maxchar > MAX_UNICODE) {
1634 PyErr_Format(PyExc_ValueError,
1635 "character U+%x is not in range [U+0000; U+10ffff]",
1636 ch);
1637 return -1;
1638 }
1639 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640 }
1641 return 0;
1642}
1643
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001644int
1645_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646{
1647 wchar_t *end;
1648 Py_UCS4 maxchar = 0;
1649 Py_ssize_t num_surrogates;
1650#if SIZEOF_WCHAR_T == 2
1651 Py_ssize_t length_wo_surrogates;
1652#endif
1653
Georg Brandl7597add2011-10-05 16:36:47 +02001654 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001655 strings were created using _PyObject_New() and where no canonical
1656 representation (the str field) has been set yet aka strings
1657 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001658 assert(_PyUnicode_CHECK(unicode));
1659 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001661 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001662 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001663 /* Actually, it should neither be interned nor be anything else: */
1664 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001667 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001668 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001669 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001670
1671 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001672 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1673 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001674 PyErr_NoMemory();
1675 return -1;
1676 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001677 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001678 _PyUnicode_WSTR(unicode), end,
1679 PyUnicode_1BYTE_DATA(unicode));
1680 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1681 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1682 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1683 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001684 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001685 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001686 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001687 }
1688 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001689 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001690 _PyUnicode_UTF8(unicode) = NULL;
1691 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001692 }
1693 PyObject_FREE(_PyUnicode_WSTR(unicode));
1694 _PyUnicode_WSTR(unicode) = NULL;
1695 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1696 }
1697 /* In this case we might have to convert down from 4-byte native
1698 wchar_t to 2-byte unicode. */
1699 else if (maxchar < 65536) {
1700 assert(num_surrogates == 0 &&
1701 "FindMaxCharAndNumSurrogatePairs() messed up");
1702
Victor Stinner506f5922011-09-28 22:34:18 +02001703#if SIZEOF_WCHAR_T == 2
1704 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001705 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001706 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1707 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1708 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001709 _PyUnicode_UTF8(unicode) = NULL;
1710 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001711#else
1712 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001713 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001714 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001715 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001716 PyErr_NoMemory();
1717 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 }
Victor Stinner506f5922011-09-28 22:34:18 +02001719 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1720 _PyUnicode_WSTR(unicode), end,
1721 PyUnicode_2BYTE_DATA(unicode));
1722 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1723 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1724 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001725 _PyUnicode_UTF8(unicode) = NULL;
1726 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001727 PyObject_FREE(_PyUnicode_WSTR(unicode));
1728 _PyUnicode_WSTR(unicode) = NULL;
1729 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1730#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 }
1732 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1733 else {
1734#if SIZEOF_WCHAR_T == 2
1735 /* in case the native representation is 2-bytes, we need to allocate a
1736 new normalized 4-byte version. */
1737 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001738 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1739 PyErr_NoMemory();
1740 return -1;
1741 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001742 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1743 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 PyErr_NoMemory();
1745 return -1;
1746 }
1747 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1748 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001749 _PyUnicode_UTF8(unicode) = NULL;
1750 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001751 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1752 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001753 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 PyObject_FREE(_PyUnicode_WSTR(unicode));
1755 _PyUnicode_WSTR(unicode) = NULL;
1756 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1757#else
1758 assert(num_surrogates == 0);
1759
Victor Stinnerc3c74152011-10-02 20:39:55 +02001760 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001762 _PyUnicode_UTF8(unicode) = NULL;
1763 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1765#endif
1766 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1767 }
1768 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001769 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770 return 0;
1771}
1772
Alexander Belopolsky40018472011-02-26 01:02:56 +00001773static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001774unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775{
Walter Dörwald16807132007-05-25 13:52:07 +00001776 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001777 case SSTATE_NOT_INTERNED:
1778 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001779
Benjamin Peterson29060642009-01-31 22:14:21 +00001780 case SSTATE_INTERNED_MORTAL:
1781 /* revive dead object temporarily for DelItem */
1782 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001783 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001784 Py_FatalError(
1785 "deletion of interned string failed");
1786 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001787
Benjamin Peterson29060642009-01-31 22:14:21 +00001788 case SSTATE_INTERNED_IMMORTAL:
1789 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001790 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001791
Benjamin Peterson29060642009-01-31 22:14:21 +00001792 default:
1793 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001794 }
1795
Victor Stinner03490912011-10-03 23:45:12 +02001796 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001798 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001799 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001800 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1801 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001803 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001804}
1805
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001806#ifdef Py_DEBUG
1807static int
1808unicode_is_singleton(PyObject *unicode)
1809{
1810 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1811 if (unicode == unicode_empty)
1812 return 1;
1813 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1814 {
1815 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1816 if (ch < 256 && unicode_latin1[ch] == unicode)
1817 return 1;
1818 }
1819 return 0;
1820}
1821#endif
1822
Alexander Belopolsky40018472011-02-26 01:02:56 +00001823static int
Victor Stinner488fa492011-12-12 00:01:39 +01001824unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001825{
Victor Stinner488fa492011-12-12 00:01:39 +01001826 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001827 if (Py_REFCNT(unicode) != 1)
1828 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001829 if (_PyUnicode_HASH(unicode) != -1)
1830 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001831 if (PyUnicode_CHECK_INTERNED(unicode))
1832 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001833 if (!PyUnicode_CheckExact(unicode))
1834 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001835#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001836 /* singleton refcount is greater than 1 */
1837 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001838#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001839 return 1;
1840}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001841
Victor Stinnerfe226c02011-10-03 03:52:20 +02001842static int
1843unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1844{
1845 PyObject *unicode;
1846 Py_ssize_t old_length;
1847
1848 assert(p_unicode != NULL);
1849 unicode = *p_unicode;
1850
1851 assert(unicode != NULL);
1852 assert(PyUnicode_Check(unicode));
1853 assert(0 <= length);
1854
Victor Stinner910337b2011-10-03 03:20:16 +02001855 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001856 old_length = PyUnicode_WSTR_LENGTH(unicode);
1857 else
1858 old_length = PyUnicode_GET_LENGTH(unicode);
1859 if (old_length == length)
1860 return 0;
1861
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001862 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001863 _Py_INCREF_UNICODE_EMPTY();
1864 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001865 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001866 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001867 return 0;
1868 }
1869
Victor Stinner488fa492011-12-12 00:01:39 +01001870 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001871 PyObject *copy = resize_copy(unicode, length);
1872 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001873 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001874 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001875 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001876 }
1877
Victor Stinnerfe226c02011-10-03 03:52:20 +02001878 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001879 PyObject *new_unicode = resize_compact(unicode, length);
1880 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001881 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001882 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001883 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001884 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001885 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001886}
1887
Alexander Belopolsky40018472011-02-26 01:02:56 +00001888int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001890{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001891 PyObject *unicode;
1892 if (p_unicode == NULL) {
1893 PyErr_BadInternalCall();
1894 return -1;
1895 }
1896 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001897 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001898 {
1899 PyErr_BadInternalCall();
1900 return -1;
1901 }
1902 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001903}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001904
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001905/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001906
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001907 WARNING: The function doesn't copy the terminating null character and
1908 doesn't check the maximum character (may write a latin1 character in an
1909 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001910static void
1911unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1912 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001913{
1914 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1915 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001916 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001917
1918 switch (kind) {
1919 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001920 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001921#ifdef Py_DEBUG
1922 if (PyUnicode_IS_ASCII(unicode)) {
1923 Py_UCS4 maxchar = ucs1lib_find_max_char(
1924 (const Py_UCS1*)str,
1925 (const Py_UCS1*)str + len);
1926 assert(maxchar < 128);
1927 }
1928#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001929 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001930 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001931 }
1932 case PyUnicode_2BYTE_KIND: {
1933 Py_UCS2 *start = (Py_UCS2 *)data + index;
1934 Py_UCS2 *ucs2 = start;
1935 assert(index <= PyUnicode_GET_LENGTH(unicode));
1936
Victor Stinner184252a2012-06-16 02:57:41 +02001937 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001938 *ucs2 = (Py_UCS2)*str;
1939
1940 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001941 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001942 }
1943 default: {
1944 Py_UCS4 *start = (Py_UCS4 *)data + index;
1945 Py_UCS4 *ucs4 = start;
1946 assert(kind == PyUnicode_4BYTE_KIND);
1947 assert(index <= PyUnicode_GET_LENGTH(unicode));
1948
Victor Stinner184252a2012-06-16 02:57:41 +02001949 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001950 *ucs4 = (Py_UCS4)*str;
1951
1952 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001953 }
1954 }
1955}
1956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957static PyObject*
1958get_latin1_char(unsigned char ch)
1959{
Victor Stinnera464fc12011-10-02 20:39:30 +02001960 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001962 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963 if (!unicode)
1964 return NULL;
1965 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001966 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 unicode_latin1[ch] = unicode;
1968 }
1969 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001970 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001971}
1972
Victor Stinner985a82a2014-01-03 12:53:47 +01001973static PyObject*
1974unicode_char(Py_UCS4 ch)
1975{
1976 PyObject *unicode;
1977
1978 assert(ch <= MAX_UNICODE);
1979
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001980 if (ch < 256)
1981 return get_latin1_char(ch);
1982
Victor Stinner985a82a2014-01-03 12:53:47 +01001983 unicode = PyUnicode_New(1, ch);
1984 if (unicode == NULL)
1985 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001986
1987 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1988 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001989 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001990 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001991 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1992 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1993 }
1994 assert(_PyUnicode_CheckConsistency(unicode, 1));
1995 return unicode;
1996}
1997
Alexander Belopolsky40018472011-02-26 01:02:56 +00001998PyObject *
1999PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002001 if (u == NULL)
2002 return (PyObject*)_PyUnicode_New(size);
2003
2004 if (size < 0) {
2005 PyErr_BadInternalCall();
2006 return NULL;
2007 }
2008
2009 return PyUnicode_FromWideChar(u, size);
2010}
2011
2012PyObject *
2013PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2014{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002015 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 Py_UCS4 maxchar = 0;
2017 Py_ssize_t num_surrogates;
2018
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002019 if (u == NULL && size != 0) {
2020 PyErr_BadInternalCall();
2021 return NULL;
2022 }
2023
2024 if (size == -1) {
2025 size = wcslen(u);
2026 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002027
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002028 /* If the Unicode data is known at construction time, we can apply
2029 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002032 if (size == 0)
2033 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 /* Single character Unicode objects in the Latin-1 range are
2036 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002037 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002038 return get_latin1_char((unsigned char)*u);
2039
2040 /* If not empty and not single character, copy the Unicode data
2041 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002042 if (find_maxchar_surrogates(u, u + size,
2043 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 return NULL;
2045
Victor Stinner8faf8212011-12-08 22:14:11 +01002046 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 if (!unicode)
2048 return NULL;
2049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 switch (PyUnicode_KIND(unicode)) {
2051 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002052 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002053 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2054 break;
2055 case PyUnicode_2BYTE_KIND:
2056#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002057 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002058#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002059 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002060 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2061#endif
2062 break;
2063 case PyUnicode_4BYTE_KIND:
2064#if SIZEOF_WCHAR_T == 2
2065 /* This is the only case which has to process surrogates, thus
2066 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002067 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002068#else
2069 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002070 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002071#endif
2072 break;
2073 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002074 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002075 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002077 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078}
2079
Alexander Belopolsky40018472011-02-26 01:02:56 +00002080PyObject *
2081PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002082{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002083 if (size < 0) {
2084 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002085 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002086 return NULL;
2087 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002088 if (u != NULL)
2089 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2090 else
2091 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002092}
2093
Alexander Belopolsky40018472011-02-26 01:02:56 +00002094PyObject *
2095PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002096{
2097 size_t size = strlen(u);
2098 if (size > PY_SSIZE_T_MAX) {
2099 PyErr_SetString(PyExc_OverflowError, "input too long");
2100 return NULL;
2101 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002102 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002103}
2104
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002105PyObject *
2106_PyUnicode_FromId(_Py_Identifier *id)
2107{
2108 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002109 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2110 strlen(id->string),
2111 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002112 if (!id->object)
2113 return NULL;
2114 PyUnicode_InternInPlace(&id->object);
2115 assert(!id->next);
2116 id->next = static_strings;
2117 static_strings = id;
2118 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002119 return id->object;
2120}
2121
2122void
2123_PyUnicode_ClearStaticStrings()
2124{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002125 _Py_Identifier *tmp, *s = static_strings;
2126 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002127 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002128 tmp = s->next;
2129 s->next = NULL;
2130 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002131 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002132 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002133}
2134
Benjamin Peterson0df54292012-03-26 14:50:32 -04002135/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002136
Victor Stinnerd3f08822012-05-29 12:57:52 +02002137PyObject*
2138_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002139{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002140 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002141 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002142 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002143#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002144 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002145#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002146 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002147 }
Victor Stinner785938e2011-12-11 20:09:03 +01002148 unicode = PyUnicode_New(size, 127);
2149 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002150 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002151 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2152 assert(_PyUnicode_CheckConsistency(unicode, 1));
2153 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002154}
2155
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002156static Py_UCS4
2157kind_maxchar_limit(unsigned int kind)
2158{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002159 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002160 case PyUnicode_1BYTE_KIND:
2161 return 0x80;
2162 case PyUnicode_2BYTE_KIND:
2163 return 0x100;
2164 case PyUnicode_4BYTE_KIND:
2165 return 0x10000;
2166 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002167 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002168 }
2169}
2170
Victor Stinner702c7342011-10-05 13:50:52 +02002171static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002172_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002173{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002175 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002176
Serhiy Storchaka678db842013-01-26 12:16:36 +02002177 if (size == 0)
2178 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002179 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002180 if (size == 1)
2181 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002182
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002183 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002184 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 if (!res)
2186 return NULL;
2187 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002188 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002190}
2191
Victor Stinnere57b1c02011-09-28 22:20:48 +02002192static PyObject*
2193_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194{
2195 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002196 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002197
Serhiy Storchaka678db842013-01-26 12:16:36 +02002198 if (size == 0)
2199 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002200 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002201 if (size == 1)
2202 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002203
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002204 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002205 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 if (!res)
2207 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002208 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002210 else {
2211 _PyUnicode_CONVERT_BYTES(
2212 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2213 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002214 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 return res;
2216}
2217
Victor Stinnere57b1c02011-09-28 22:20:48 +02002218static PyObject*
2219_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220{
2221 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002222 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002223
Serhiy Storchaka678db842013-01-26 12:16:36 +02002224 if (size == 0)
2225 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002226 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002227 if (size == 1)
2228 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002229
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002230 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002231 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 if (!res)
2233 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002234 if (max_char < 256)
2235 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2236 PyUnicode_1BYTE_DATA(res));
2237 else if (max_char < 0x10000)
2238 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2239 PyUnicode_2BYTE_DATA(res));
2240 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002242 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002243 return res;
2244}
2245
2246PyObject*
2247PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2248{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002249 if (size < 0) {
2250 PyErr_SetString(PyExc_ValueError, "size must be positive");
2251 return NULL;
2252 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002253 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002255 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002257 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002258 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002259 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002260 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002261 PyErr_SetString(PyExc_SystemError, "invalid kind");
2262 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264}
2265
Victor Stinnerece58de2012-04-23 23:36:38 +02002266Py_UCS4
2267_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2268{
2269 enum PyUnicode_Kind kind;
2270 void *startptr, *endptr;
2271
2272 assert(PyUnicode_IS_READY(unicode));
2273 assert(0 <= start);
2274 assert(end <= PyUnicode_GET_LENGTH(unicode));
2275 assert(start <= end);
2276
2277 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2278 return PyUnicode_MAX_CHAR_VALUE(unicode);
2279
2280 if (start == end)
2281 return 127;
2282
Victor Stinner94d558b2012-04-27 22:26:58 +02002283 if (PyUnicode_IS_ASCII(unicode))
2284 return 127;
2285
Victor Stinnerece58de2012-04-23 23:36:38 +02002286 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002287 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002288 endptr = (char *)startptr + end * kind;
2289 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002290 switch(kind) {
2291 case PyUnicode_1BYTE_KIND:
2292 return ucs1lib_find_max_char(startptr, endptr);
2293 case PyUnicode_2BYTE_KIND:
2294 return ucs2lib_find_max_char(startptr, endptr);
2295 case PyUnicode_4BYTE_KIND:
2296 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002297 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002298 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002299 }
2300}
2301
Victor Stinner25a4b292011-10-06 12:31:55 +02002302/* Ensure that a string uses the most efficient storage, if it is not the
2303 case: create a new string with of the right kind. Write NULL into *p_unicode
2304 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002305static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002306unicode_adjust_maxchar(PyObject **p_unicode)
2307{
2308 PyObject *unicode, *copy;
2309 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002310 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002311 unsigned int kind;
2312
2313 assert(p_unicode != NULL);
2314 unicode = *p_unicode;
2315 assert(PyUnicode_IS_READY(unicode));
2316 if (PyUnicode_IS_ASCII(unicode))
2317 return;
2318
2319 len = PyUnicode_GET_LENGTH(unicode);
2320 kind = PyUnicode_KIND(unicode);
2321 if (kind == PyUnicode_1BYTE_KIND) {
2322 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002323 max_char = ucs1lib_find_max_char(u, u + len);
2324 if (max_char >= 128)
2325 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002326 }
2327 else if (kind == PyUnicode_2BYTE_KIND) {
2328 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002329 max_char = ucs2lib_find_max_char(u, u + len);
2330 if (max_char >= 256)
2331 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002332 }
2333 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002334 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002335 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002336 max_char = ucs4lib_find_max_char(u, u + len);
2337 if (max_char >= 0x10000)
2338 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002339 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002340 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002341 if (copy != NULL)
2342 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002343 Py_DECREF(unicode);
2344 *p_unicode = copy;
2345}
2346
Victor Stinner034f6cf2011-09-30 02:26:44 +02002347PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002348_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002349{
Victor Stinner87af4f22011-11-21 23:03:47 +01002350 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002351 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002352
Victor Stinner034f6cf2011-09-30 02:26:44 +02002353 if (!PyUnicode_Check(unicode)) {
2354 PyErr_BadInternalCall();
2355 return NULL;
2356 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002357 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002358 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002359
Victor Stinner87af4f22011-11-21 23:03:47 +01002360 length = PyUnicode_GET_LENGTH(unicode);
2361 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002362 if (!copy)
2363 return NULL;
2364 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2365
Christian Heimesf051e432016-09-13 20:22:02 +02002366 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002367 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002368 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002369 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002370}
2371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002372
Victor Stinnerbc603d12011-10-02 01:00:40 +02002373/* Widen Unicode objects to larger buffers. Don't write terminating null
2374 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002375
2376void*
2377_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2378{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002379 Py_ssize_t len;
2380 void *result;
2381 unsigned int skind;
2382
Benjamin Petersonbac79492012-01-14 13:34:47 -05002383 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002384 return NULL;
2385
2386 len = PyUnicode_GET_LENGTH(s);
2387 skind = PyUnicode_KIND(s);
2388 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002389 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002390 return NULL;
2391 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002392 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002393 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002394 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002395 if (!result)
2396 return PyErr_NoMemory();
2397 assert(skind == PyUnicode_1BYTE_KIND);
2398 _PyUnicode_CONVERT_BYTES(
2399 Py_UCS1, Py_UCS2,
2400 PyUnicode_1BYTE_DATA(s),
2401 PyUnicode_1BYTE_DATA(s) + len,
2402 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002404 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002405 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002406 if (!result)
2407 return PyErr_NoMemory();
2408 if (skind == PyUnicode_2BYTE_KIND) {
2409 _PyUnicode_CONVERT_BYTES(
2410 Py_UCS2, Py_UCS4,
2411 PyUnicode_2BYTE_DATA(s),
2412 PyUnicode_2BYTE_DATA(s) + len,
2413 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002414 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002415 else {
2416 assert(skind == PyUnicode_1BYTE_KIND);
2417 _PyUnicode_CONVERT_BYTES(
2418 Py_UCS1, Py_UCS4,
2419 PyUnicode_1BYTE_DATA(s),
2420 PyUnicode_1BYTE_DATA(s) + len,
2421 result);
2422 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002423 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002424 default:
2425 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 }
Victor Stinner01698042011-10-04 00:04:26 +02002427 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 return NULL;
2429}
2430
2431static Py_UCS4*
2432as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2433 int copy_null)
2434{
2435 int kind;
2436 void *data;
2437 Py_ssize_t len, targetlen;
2438 if (PyUnicode_READY(string) == -1)
2439 return NULL;
2440 kind = PyUnicode_KIND(string);
2441 data = PyUnicode_DATA(string);
2442 len = PyUnicode_GET_LENGTH(string);
2443 targetlen = len;
2444 if (copy_null)
2445 targetlen++;
2446 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002447 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 if (!target) {
2449 PyErr_NoMemory();
2450 return NULL;
2451 }
2452 }
2453 else {
2454 if (targetsize < targetlen) {
2455 PyErr_Format(PyExc_SystemError,
2456 "string is longer than the buffer");
2457 if (copy_null && 0 < targetsize)
2458 target[0] = 0;
2459 return NULL;
2460 }
2461 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002462 if (kind == PyUnicode_1BYTE_KIND) {
2463 Py_UCS1 *start = (Py_UCS1 *) data;
2464 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002465 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002466 else if (kind == PyUnicode_2BYTE_KIND) {
2467 Py_UCS2 *start = (Py_UCS2 *) data;
2468 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2469 }
2470 else {
2471 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002472 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002473 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002474 if (copy_null)
2475 target[len] = 0;
2476 return target;
2477}
2478
2479Py_UCS4*
2480PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2481 int copy_null)
2482{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002483 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002484 PyErr_BadInternalCall();
2485 return NULL;
2486 }
2487 return as_ucs4(string, target, targetsize, copy_null);
2488}
2489
2490Py_UCS4*
2491PyUnicode_AsUCS4Copy(PyObject *string)
2492{
2493 return as_ucs4(string, NULL, 0, 1);
2494}
2495
/* Maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign and 1 for the terminating null character written
   by sprintf (hence the leading 2). 53/22 is an upper bound for
   log10(256). */
2499#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002500
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002501static int
2502unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2503 Py_ssize_t width, Py_ssize_t precision)
2504{
2505 Py_ssize_t length, fill, arglen;
2506 Py_UCS4 maxchar;
2507
2508 if (PyUnicode_READY(str) == -1)
2509 return -1;
2510
2511 length = PyUnicode_GET_LENGTH(str);
2512 if ((precision == -1 || precision >= length)
2513 && width <= length)
2514 return _PyUnicodeWriter_WriteStr(writer, str);
2515
2516 if (precision != -1)
2517 length = Py_MIN(precision, length);
2518
2519 arglen = Py_MAX(length, width);
2520 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2521 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2522 else
2523 maxchar = writer->maxchar;
2524
2525 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2526 return -1;
2527
2528 if (width > length) {
2529 fill = width - length;
2530 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2531 return -1;
2532 writer->pos += fill;
2533 }
2534
2535 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2536 str, 0, length);
2537 writer->pos += length;
2538 return 0;
2539}
2540
2541static int
Victor Stinner998b8062018-09-12 00:23:25 +02002542unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002543 Py_ssize_t width, Py_ssize_t precision)
2544{
2545 /* UTF-8 */
2546 Py_ssize_t length;
2547 PyObject *unicode;
2548 int res;
2549
2550 length = strlen(str);
2551 if (precision != -1)
2552 length = Py_MIN(length, precision);
2553 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2554 if (unicode == NULL)
2555 return -1;
2556
2557 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2558 Py_DECREF(unicode);
2559 return res;
2560}
2561
Victor Stinner96865452011-03-01 23:44:09 +00002562static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002563unicode_fromformat_arg(_PyUnicodeWriter *writer,
2564 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002565{
Victor Stinnere215d962012-10-06 23:03:36 +02002566 const char *p;
2567 Py_ssize_t len;
2568 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002569 Py_ssize_t width;
2570 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002571 int longflag;
2572 int longlongflag;
2573 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002574 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002575
2576 p = f;
2577 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002578 zeropad = 0;
2579 if (*f == '0') {
2580 zeropad = 1;
2581 f++;
2582 }
Victor Stinner96865452011-03-01 23:44:09 +00002583
2584 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002585 width = -1;
2586 if (Py_ISDIGIT((unsigned)*f)) {
2587 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002588 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002589 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002591 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002592 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002593 return NULL;
2594 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002596 f++;
2597 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002598 }
2599 precision = -1;
2600 if (*f == '.') {
2601 f++;
2602 if (Py_ISDIGIT((unsigned)*f)) {
2603 precision = (*f - '0');
2604 f++;
2605 while (Py_ISDIGIT((unsigned)*f)) {
2606 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2607 PyErr_SetString(PyExc_ValueError,
2608 "precision too big");
2609 return NULL;
2610 }
2611 precision = (precision * 10) + (*f - '0');
2612 f++;
2613 }
2614 }
Victor Stinner96865452011-03-01 23:44:09 +00002615 if (*f == '%') {
2616 /* "%.3%s" => f points to "3" */
2617 f--;
2618 }
2619 }
2620 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002621 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002622 f--;
2623 }
Victor Stinner96865452011-03-01 23:44:09 +00002624
2625 /* Handle %ld, %lu, %lld and %llu. */
2626 longflag = 0;
2627 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002628 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002629 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002630 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002631 longflag = 1;
2632 ++f;
2633 }
Victor Stinner96865452011-03-01 23:44:09 +00002634 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002635 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002636 longlongflag = 1;
2637 f += 2;
2638 }
Victor Stinner96865452011-03-01 23:44:09 +00002639 }
2640 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002641 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002642 size_tflag = 1;
2643 ++f;
2644 }
Victor Stinnere215d962012-10-06 23:03:36 +02002645
2646 if (f[1] == '\0')
2647 writer->overallocate = 0;
2648
2649 switch (*f) {
2650 case 'c':
2651 {
2652 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002653 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002654 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002655 "character argument not in range(0x110000)");
2656 return NULL;
2657 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002658 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002659 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002660 break;
2661 }
2662
2663 case 'i':
2664 case 'd':
2665 case 'u':
2666 case 'x':
2667 {
2668 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002669 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002670 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002671
2672 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002673 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002674 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002675 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002676 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002677 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002678 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002679 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002680 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002681 va_arg(*vargs, size_t));
2682 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002683 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002684 va_arg(*vargs, unsigned int));
2685 }
2686 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002687 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002688 }
2689 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002690 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002691 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002692 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002693 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002694 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002695 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002696 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002697 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002698 va_arg(*vargs, Py_ssize_t));
2699 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002700 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002701 va_arg(*vargs, int));
2702 }
2703 assert(len >= 0);
2704
Victor Stinnere215d962012-10-06 23:03:36 +02002705 if (precision < len)
2706 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002707
2708 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002709 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2710 return NULL;
2711
Victor Stinnere215d962012-10-06 23:03:36 +02002712 if (width > precision) {
2713 Py_UCS4 fillchar;
2714 fill = width - precision;
2715 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002716 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2717 return NULL;
2718 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002719 }
Victor Stinner15a11362012-10-06 23:48:20 +02002720 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002721 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002722 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2723 return NULL;
2724 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002725 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002726
Victor Stinner4a587072013-11-19 12:54:53 +01002727 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2728 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002729 break;
2730 }
2731
2732 case 'p':
2733 {
2734 char number[MAX_LONG_LONG_CHARS];
2735
2736 len = sprintf(number, "%p", va_arg(*vargs, void*));
2737 assert(len >= 0);
2738
2739 /* %p is ill-defined: ensure leading 0x. */
2740 if (number[1] == 'X')
2741 number[1] = 'x';
2742 else if (number[1] != 'x') {
2743 memmove(number + 2, number,
2744 strlen(number) + 1);
2745 number[0] = '0';
2746 number[1] = 'x';
2747 len += 2;
2748 }
2749
Victor Stinner4a587072013-11-19 12:54:53 +01002750 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002751 return NULL;
2752 break;
2753 }
2754
2755 case 's':
2756 {
2757 /* UTF-8 */
2758 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002759 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002760 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002761 break;
2762 }
2763
2764 case 'U':
2765 {
2766 PyObject *obj = va_arg(*vargs, PyObject *);
2767 assert(obj && _PyUnicode_CHECK(obj));
2768
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002769 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002770 return NULL;
2771 break;
2772 }
2773
2774 case 'V':
2775 {
2776 PyObject *obj = va_arg(*vargs, PyObject *);
2777 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002778 if (obj) {
2779 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002780 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002781 return NULL;
2782 }
2783 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002784 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002785 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002786 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002787 }
2788 break;
2789 }
2790
2791 case 'S':
2792 {
2793 PyObject *obj = va_arg(*vargs, PyObject *);
2794 PyObject *str;
2795 assert(obj);
2796 str = PyObject_Str(obj);
2797 if (!str)
2798 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002799 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002800 Py_DECREF(str);
2801 return NULL;
2802 }
2803 Py_DECREF(str);
2804 break;
2805 }
2806
2807 case 'R':
2808 {
2809 PyObject *obj = va_arg(*vargs, PyObject *);
2810 PyObject *repr;
2811 assert(obj);
2812 repr = PyObject_Repr(obj);
2813 if (!repr)
2814 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002815 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002816 Py_DECREF(repr);
2817 return NULL;
2818 }
2819 Py_DECREF(repr);
2820 break;
2821 }
2822
2823 case 'A':
2824 {
2825 PyObject *obj = va_arg(*vargs, PyObject *);
2826 PyObject *ascii;
2827 assert(obj);
2828 ascii = PyObject_ASCII(obj);
2829 if (!ascii)
2830 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002831 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002832 Py_DECREF(ascii);
2833 return NULL;
2834 }
2835 Py_DECREF(ascii);
2836 break;
2837 }
2838
2839 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002840 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002841 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002842 break;
2843
2844 default:
2845 /* if we stumble upon an unknown formatting code, copy the rest
2846 of the format string to the output string. (we cannot just
2847 skip the code, since there's no way to know what's in the
2848 argument list) */
2849 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002850 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002851 return NULL;
2852 f = p+len;
2853 return f;
2854 }
2855
2856 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002857 return f;
2858}
2859
Walter Dörwaldd2034312007-05-18 16:29:38 +00002860PyObject *
2861PyUnicode_FromFormatV(const char *format, va_list vargs)
2862{
Victor Stinnere215d962012-10-06 23:03:36 +02002863 va_list vargs2;
2864 const char *f;
2865 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002866
Victor Stinner8f674cc2013-04-17 23:02:17 +02002867 _PyUnicodeWriter_Init(&writer);
2868 writer.min_length = strlen(format) + 100;
2869 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002870
Benjamin Peterson0c212142016-09-20 20:39:33 -07002871 // Copy varags to be able to pass a reference to a subfunction.
2872 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002873
2874 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002875 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002876 f = unicode_fromformat_arg(&writer, f, &vargs2);
2877 if (f == NULL)
2878 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002879 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002880 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002881 const char *p;
2882 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002883
Victor Stinnere215d962012-10-06 23:03:36 +02002884 p = f;
2885 do
2886 {
2887 if ((unsigned char)*p > 127) {
2888 PyErr_Format(PyExc_ValueError,
2889 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2890 "string, got a non-ASCII byte: 0x%02x",
2891 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002892 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002893 }
2894 p++;
2895 }
2896 while (*p != '\0' && *p != '%');
2897 len = p - f;
2898
2899 if (*p == '\0')
2900 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002901
2902 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002903 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002904
2905 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002906 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002907 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002908 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002909 return _PyUnicodeWriter_Finish(&writer);
2910
2911 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002912 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002913 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002914 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002915}
2916
Walter Dörwaldd2034312007-05-18 16:29:38 +00002917PyObject *
2918PyUnicode_FromFormat(const char *format, ...)
2919{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002920 PyObject* ret;
2921 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002922
2923#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002924 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002925#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002926 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002927#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002928 ret = PyUnicode_FromFormatV(format, vargs);
2929 va_end(vargs);
2930 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002931}
2932
Serhiy Storchakac46db922018-10-23 22:58:24 +03002933static Py_ssize_t
2934unicode_get_widechar_size(PyObject *unicode)
2935{
2936 Py_ssize_t res;
2937
2938 assert(unicode != NULL);
2939 assert(_PyUnicode_CHECK(unicode));
2940
2941 if (_PyUnicode_WSTR(unicode) != NULL) {
2942 return PyUnicode_WSTR_LENGTH(unicode);
2943 }
2944 assert(PyUnicode_IS_READY(unicode));
2945
2946 res = _PyUnicode_LENGTH(unicode);
2947#if SIZEOF_WCHAR_T == 2
2948 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
2949 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2950 const Py_UCS4 *end = s + res;
2951 for (; s < end; ++s) {
2952 if (*s > 0xFFFF) {
2953 ++res;
2954 }
2955 }
2956 }
2957#endif
2958 return res;
2959}
2960
2961static void
2962unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
2963{
2964 const wchar_t *wstr;
2965
2966 assert(unicode != NULL);
2967 assert(_PyUnicode_CHECK(unicode));
2968
2969 wstr = _PyUnicode_WSTR(unicode);
2970 if (wstr != NULL) {
2971 memcpy(w, wstr, size * sizeof(wchar_t));
2972 return;
2973 }
2974 assert(PyUnicode_IS_READY(unicode));
2975
2976 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
2977 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
2978 for (; size--; ++s, ++w) {
2979 *w = *s;
2980 }
2981 }
2982 else {
2983#if SIZEOF_WCHAR_T == 4
2984 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
2985 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
2986 for (; size--; ++s, ++w) {
2987 *w = *s;
2988 }
2989#else
2990 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2991 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2992 for (; size--; ++s, ++w) {
2993 Py_UCS4 ch = *s;
2994 if (ch > 0xFFFF) {
2995 assert(ch <= MAX_UNICODE);
2996 /* encode surrogate pair in this case */
2997 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
2998 if (!size--)
2999 break;
3000 *w = Py_UNICODE_LOW_SURROGATE(ch);
3001 }
3002 else {
3003 *w = ch;
3004 }
3005 }
3006#endif
3007 }
3008}
3009
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003010#ifdef HAVE_WCHAR_H
3011
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003012/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003013
Victor Stinnerd88d9832011-09-06 02:00:05 +02003014 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003015 character) required to convert the unicode object. Ignore size argument.
3016
Victor Stinnerd88d9832011-09-06 02:00:05 +02003017 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003018 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003019 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003020Py_ssize_t
3021PyUnicode_AsWideChar(PyObject *unicode,
3022 wchar_t *w,
3023 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003024{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003025 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003026
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003027 if (unicode == NULL) {
3028 PyErr_BadInternalCall();
3029 return -1;
3030 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003031 if (!PyUnicode_Check(unicode)) {
3032 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003033 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003034 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003035
3036 res = unicode_get_widechar_size(unicode);
3037 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003038 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003039 }
3040
3041 if (size > res) {
3042 size = res + 1;
3043 }
3044 else {
3045 res = size;
3046 }
3047 unicode_copy_as_widechar(unicode, w, size);
3048 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003049}
3050
Victor Stinner137c34c2010-09-29 10:25:54 +00003051wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003052PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003053 Py_ssize_t *size)
3054{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003055 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003056 Py_ssize_t buflen;
3057
3058 if (unicode == NULL) {
3059 PyErr_BadInternalCall();
3060 return NULL;
3061 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003062 if (!PyUnicode_Check(unicode)) {
3063 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003064 return NULL;
3065 }
3066
Serhiy Storchakac46db922018-10-23 22:58:24 +03003067 buflen = unicode_get_widechar_size(unicode);
3068 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003069 if (buffer == NULL) {
3070 PyErr_NoMemory();
3071 return NULL;
3072 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003073 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3074 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003075 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003076 }
3077 else if (wcslen(buffer) != (size_t)buflen) {
3078 PyMem_FREE(buffer);
3079 PyErr_SetString(PyExc_ValueError,
3080 "embedded null character");
3081 return NULL;
3082 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003083 return buffer;
3084}
3085
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003086#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087
Alexander Belopolsky40018472011-02-26 01:02:56 +00003088PyObject *
3089PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003090{
Victor Stinner8faf8212011-12-08 22:14:11 +01003091 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003092 PyErr_SetString(PyExc_ValueError,
3093 "chr() arg not in range(0x110000)");
3094 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003095 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003096
Victor Stinner985a82a2014-01-03 12:53:47 +01003097 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003098}
3099
Alexander Belopolsky40018472011-02-26 01:02:56 +00003100PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003101PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003103 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003104 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003105 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003106 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003107 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003108 Py_INCREF(obj);
3109 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003110 }
3111 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003112 /* For a Unicode subtype that's not a Unicode object,
3113 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003114 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003115 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003116 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003117 "Can't convert '%.100s' object to str implicitly",
3118 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003119 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003120}
3121
Alexander Belopolsky40018472011-02-26 01:02:56 +00003122PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003123PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003124 const char *encoding,
3125 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003126{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003127 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003128 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003129
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003131 PyErr_BadInternalCall();
3132 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003134
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003135 /* Decoding bytes objects is the most common case and should be fast */
3136 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003137 if (PyBytes_GET_SIZE(obj) == 0)
3138 _Py_RETURN_UNICODE_EMPTY();
3139 v = PyUnicode_Decode(
3140 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3141 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003142 return v;
3143 }
3144
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003145 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003146 PyErr_SetString(PyExc_TypeError,
3147 "decoding str is not supported");
3148 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003149 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003150
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003151 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3152 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3153 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003154 "decoding to str: need a bytes-like object, %.80s found",
3155 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003156 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003157 }
Tim Petersced69f82003-09-16 20:30:58 +00003158
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003159 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003160 PyBuffer_Release(&buffer);
3161 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003163
Serhiy Storchaka05997252013-01-26 12:14:02 +02003164 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003165 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003166 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167}
3168
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase; every run of other
   characters that sits between two copied characters becomes a single '_',
   and such runs at the start or end of the name are dropped.

   The result is written to `lower`, a buffer of `lower_len` bytes, and is
   null-terminated. Return 1 on success, or 0 on error (encoding is longer
   than lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst;
    char *limit;
    int pending_sep;

    assert(encoding != NULL);

    dst = lower;
    /* Last usable slot: reserved for the terminating null character. */
    limit = &lower[lower_len - 1];
    pending_sep = 0;

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!Py_ISALNUM(c) && c != '.') {
            /* Punctuation: remember it, emit '_' lazily. */
            pending_sep = 1;
            continue;
        }

        /* Emit the separator only between two copied characters, never
           before the first one. */
        if (pending_sep && dst != lower) {
            if (dst == limit) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == limit) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3217
Alexander Belopolsky40018472011-02-26 01:02:56 +00003218PyObject *
3219PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003220 Py_ssize_t size,
3221 const char *encoding,
3222 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003223{
3224 PyObject *buffer = NULL, *unicode;
3225 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003226 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3227
3228 if (encoding == NULL) {
3229 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3230 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003231
Fred Drakee4315f52000-05-09 19:53:39 +00003232 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003233 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3234 char *lower = buflower;
3235
3236 /* Fast paths */
3237 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3238 lower += 3;
3239 if (*lower == '_') {
3240 /* Match "utf8" and "utf_8" */
3241 lower++;
3242 }
3243
3244 if (lower[0] == '8' && lower[1] == 0) {
3245 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3246 }
3247 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3248 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3249 }
3250 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3251 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3252 }
3253 }
3254 else {
3255 if (strcmp(lower, "ascii") == 0
3256 || strcmp(lower, "us_ascii") == 0) {
3257 return PyUnicode_DecodeASCII(s, size, errors);
3258 }
Steve Dowercc16be82016-09-08 10:35:16 -07003259 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003260 else if (strcmp(lower, "mbcs") == 0) {
3261 return PyUnicode_DecodeMBCS(s, size, errors);
3262 }
3263 #endif
3264 else if (strcmp(lower, "latin1") == 0
3265 || strcmp(lower, "latin_1") == 0
3266 || strcmp(lower, "iso_8859_1") == 0
3267 || strcmp(lower, "iso8859_1") == 0) {
3268 return PyUnicode_DecodeLatin1(s, size, errors);
3269 }
3270 }
Victor Stinner37296e82010-06-10 13:36:23 +00003271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272
3273 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003274 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003275 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003276 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003277 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278 if (buffer == NULL)
3279 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003280 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003281 if (unicode == NULL)
3282 goto onError;
3283 if (!PyUnicode_Check(unicode)) {
3284 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003285 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003286 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003287 encoding,
3288 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289 Py_DECREF(unicode);
3290 goto onError;
3291 }
3292 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003293 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003294
Benjamin Peterson29060642009-01-31 22:14:21 +00003295 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296 Py_XDECREF(buffer);
3297 return NULL;
3298}
3299
Alexander Belopolsky40018472011-02-26 01:02:56 +00003300PyObject *
3301PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003302 const char *encoding,
3303 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003304{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003305 if (!PyUnicode_Check(unicode)) {
3306 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003307 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003308 }
3309
Serhiy Storchaka00939072016-10-27 21:05:49 +03003310 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3311 "PyUnicode_AsDecodedObject() is deprecated; "
3312 "use PyCodec_Decode() to decode from str", 1) < 0)
3313 return NULL;
3314
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003315 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003317
3318 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003319 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003320}
3321
Alexander Belopolsky40018472011-02-26 01:02:56 +00003322PyObject *
3323PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003324 const char *encoding,
3325 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003326{
3327 PyObject *v;
3328
3329 if (!PyUnicode_Check(unicode)) {
3330 PyErr_BadArgument();
3331 goto onError;
3332 }
3333
Serhiy Storchaka00939072016-10-27 21:05:49 +03003334 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3335 "PyUnicode_AsDecodedUnicode() is deprecated; "
3336 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3337 return NULL;
3338
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003339 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003340 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003341
3342 /* Decode via the codec registry */
3343 v = PyCodec_Decode(unicode, encoding, errors);
3344 if (v == NULL)
3345 goto onError;
3346 if (!PyUnicode_Check(v)) {
3347 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003348 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003349 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003350 encoding,
3351 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003352 Py_DECREF(v);
3353 goto onError;
3354 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003355 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003356
Benjamin Peterson29060642009-01-31 22:14:21 +00003357 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003358 return NULL;
3359}
3360
Alexander Belopolsky40018472011-02-26 01:02:56 +00003361PyObject *
3362PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003363 Py_ssize_t size,
3364 const char *encoding,
3365 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366{
3367 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003368
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003369 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003370 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3373 Py_DECREF(unicode);
3374 return v;
3375}
3376
Alexander Belopolsky40018472011-02-26 01:02:56 +00003377PyObject *
3378PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003379 const char *encoding,
3380 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003381{
3382 PyObject *v;
3383
3384 if (!PyUnicode_Check(unicode)) {
3385 PyErr_BadArgument();
3386 goto onError;
3387 }
3388
Serhiy Storchaka00939072016-10-27 21:05:49 +03003389 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3390 "PyUnicode_AsEncodedObject() is deprecated; "
3391 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3392 "or PyCodec_Encode() for generic encoding", 1) < 0)
3393 return NULL;
3394
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003395 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003396 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003397
3398 /* Encode via the codec registry */
3399 v = PyCodec_Encode(unicode, encoding, errors);
3400 if (v == NULL)
3401 goto onError;
3402 return v;
3403
Benjamin Peterson29060642009-01-31 22:14:21 +00003404 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003405 return NULL;
3406}
3407
Victor Stinner1b579672011-12-17 05:47:23 +01003408
Victor Stinner2cba6b82018-01-10 22:46:15 +01003409static PyObject *
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003410unicode_encode_locale(PyObject *unicode, const char *errors,
3411 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003412{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003413 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003414
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003415 Py_ssize_t wlen;
3416 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3417 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003418 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003419 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003420
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003421 Py_ssize_t wlen2 = wcslen(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003422 if (wlen2 != wlen) {
3423 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003424 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003425 return NULL;
3426 }
3427
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003428 char *str;
3429 size_t error_pos;
3430 const char *reason;
3431 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003432 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003433 if (res != 0) {
3434 if (res == -2) {
3435 PyObject *exc;
3436 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3437 "locale", unicode,
3438 (Py_ssize_t)error_pos,
3439 (Py_ssize_t)(error_pos+1),
3440 reason);
3441 if (exc != NULL) {
3442 PyCodec_StrictErrors(exc);
3443 Py_DECREF(exc);
3444 }
3445 return NULL;
Victor Stinner2cba6b82018-01-10 22:46:15 +01003446 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003447 else if (res == -3) {
3448 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3449 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003450 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003451 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003452 PyMem_Free(wstr);
3453 return NULL;
3454 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003455 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003456 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003457
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003458 PyObject *bytes = PyBytes_FromString(str);
3459 PyMem_RawFree(str);
3460 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003461}
3462
Victor Stinnerad158722010-10-27 00:25:46 +00003463PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003464PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3465{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003466 return unicode_encode_locale(unicode, errors, 1);
3467}
3468
3469PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003470PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003471{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003472 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003473 const _PyCoreConfig *config = &interp->core_config;
3474#if defined(__APPLE__)
3475 return _PyUnicode_AsUTF8String(unicode, config->filesystem_errors);
3476#else
Victor Stinner793b5312011-04-27 00:24:21 +02003477 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3478 cannot use it to encode and decode filenames before it is loaded. Load
3479 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003480 implementation of the locale codec until the codec registry is
3481 initialized and the Python codec is loaded. See initfsencoding(). */
3482 if (interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003483 return PyUnicode_AsEncodedString(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003484 config->filesystem_encoding,
3485 config->filesystem_errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003486 }
3487 else {
Victor Stinner2cba6b82018-01-10 22:46:15 +01003488 return unicode_encode_locale(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003489 config->filesystem_errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003490 }
Victor Stinnerad158722010-10-27 00:25:46 +00003491#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003492}
3493
Alexander Belopolsky40018472011-02-26 01:02:56 +00003494PyObject *
3495PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003496 const char *encoding,
3497 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003498{
3499 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003500 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003501
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502 if (!PyUnicode_Check(unicode)) {
3503 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003504 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003505 }
Fred Drakee4315f52000-05-09 19:53:39 +00003506
Victor Stinner942889a2016-09-05 15:40:10 -07003507 if (encoding == NULL) {
3508 return _PyUnicode_AsUTF8String(unicode, errors);
3509 }
3510
Fred Drakee4315f52000-05-09 19:53:39 +00003511 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003512 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3513 char *lower = buflower;
3514
3515 /* Fast paths */
3516 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3517 lower += 3;
3518 if (*lower == '_') {
3519 /* Match "utf8" and "utf_8" */
3520 lower++;
3521 }
3522
3523 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003524 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003525 }
3526 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3527 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3528 }
3529 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3530 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3531 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003532 }
Victor Stinner942889a2016-09-05 15:40:10 -07003533 else {
3534 if (strcmp(lower, "ascii") == 0
3535 || strcmp(lower, "us_ascii") == 0) {
3536 return _PyUnicode_AsASCIIString(unicode, errors);
3537 }
Steve Dowercc16be82016-09-08 10:35:16 -07003538#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003539 else if (strcmp(lower, "mbcs") == 0) {
3540 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3541 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003542#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003543 else if (strcmp(lower, "latin1") == 0 ||
3544 strcmp(lower, "latin_1") == 0 ||
3545 strcmp(lower, "iso_8859_1") == 0 ||
3546 strcmp(lower, "iso8859_1") == 0) {
3547 return _PyUnicode_AsLatin1String(unicode, errors);
3548 }
3549 }
Victor Stinner37296e82010-06-10 13:36:23 +00003550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551
3552 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003553 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003555 return NULL;
3556
3557 /* The normal path */
3558 if (PyBytes_Check(v))
3559 return v;
3560
3561 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003562 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003563 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003564 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003565
3566 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003567 "encoder %s returned bytearray instead of bytes; "
3568 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003569 encoding);
3570 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003571 Py_DECREF(v);
3572 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003573 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003574
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003575 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3576 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003577 Py_DECREF(v);
3578 return b;
3579 }
3580
3581 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003582 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003583 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003584 encoding,
3585 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003586 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003587 return NULL;
3588}
3589
Alexander Belopolsky40018472011-02-26 01:02:56 +00003590PyObject *
3591PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003592 const char *encoding,
3593 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003594{
3595 PyObject *v;
3596
3597 if (!PyUnicode_Check(unicode)) {
3598 PyErr_BadArgument();
3599 goto onError;
3600 }
3601
Serhiy Storchaka00939072016-10-27 21:05:49 +03003602 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3603 "PyUnicode_AsEncodedUnicode() is deprecated; "
3604 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3605 return NULL;
3606
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003607 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003608 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003609
3610 /* Encode via the codec registry */
3611 v = PyCodec_Encode(unicode, encoding, errors);
3612 if (v == NULL)
3613 goto onError;
3614 if (!PyUnicode_Check(v)) {
3615 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003616 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003617 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003618 encoding,
3619 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003620 Py_DECREF(v);
3621 goto onError;
3622 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003624
Benjamin Peterson29060642009-01-31 22:14:21 +00003625 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 return NULL;
3627}
3628
Victor Stinner2cba6b82018-01-10 22:46:15 +01003629static PyObject*
3630unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3631 int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003632{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003633 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003634
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003635 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3636 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003637 return NULL;
3638 }
3639
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003640 wchar_t *wstr;
3641 size_t wlen;
3642 const char *reason;
3643 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003644 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003645 if (res != 0) {
3646 if (res == -2) {
3647 PyObject *exc;
3648 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3649 "locale", str, len,
3650 (Py_ssize_t)wlen,
3651 (Py_ssize_t)(wlen + 1),
3652 reason);
3653 if (exc != NULL) {
3654 PyCodec_StrictErrors(exc);
3655 Py_DECREF(exc);
3656 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003657 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003658 else if (res == -3) {
3659 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3660 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003661 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003662 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003663 }
Victor Stinner2f197072011-12-17 07:08:30 +01003664 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003665 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003666
3667 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3668 PyMem_RawFree(wstr);
3669 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003670}
3671
3672PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003673PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3674 const char *errors)
3675{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003676 return unicode_decode_locale(str, len, errors, 1);
3677}
3678
3679PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003680PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003681{
3682 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003683 return unicode_decode_locale(str, size, errors, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003684}
3685
3686
3687PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003688PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003689 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003690 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3691}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003692
Christian Heimes5894ba72007-11-04 11:43:14 +00003693PyObject*
3694PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3695{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003696 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003697 const _PyCoreConfig *config = &interp->core_config;
3698#if defined(__APPLE__)
3699 return PyUnicode_DecodeUTF8Stateful(s, size, config->filesystem_errors, NULL);
3700#else
Victor Stinner793b5312011-04-27 00:24:21 +02003701 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3702 cannot use it to encode and decode filenames before it is loaded. Load
3703 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003704 implementation of the locale codec until the codec registry is
3705 initialized and the Python codec is loaded. See initfsencoding(). */
3706 if (interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003707 return PyUnicode_Decode(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003708 config->filesystem_encoding,
3709 config->filesystem_errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003710 }
3711 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003712 return unicode_decode_locale(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003713 config->filesystem_errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003714 }
Victor Stinnerad158722010-10-27 00:25:46 +00003715#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003716}
3717
Martin v. Löwis011e8422009-05-05 04:43:17 +00003718
3719int
3720PyUnicode_FSConverter(PyObject* arg, void* addr)
3721{
Brett Cannonec6ce872016-09-06 15:50:29 -07003722 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003723 PyObject *output = NULL;
3724 Py_ssize_t size;
3725 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003726 if (arg == NULL) {
3727 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003728 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003729 return 1;
3730 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003731 path = PyOS_FSPath(arg);
3732 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003733 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003734 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003735 if (PyBytes_Check(path)) {
3736 output = path;
3737 }
3738 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3739 output = PyUnicode_EncodeFSDefault(path);
3740 Py_DECREF(path);
3741 if (!output) {
3742 return 0;
3743 }
3744 assert(PyBytes_Check(output));
3745 }
3746
Victor Stinner0ea2a462010-04-30 00:22:08 +00003747 size = PyBytes_GET_SIZE(output);
3748 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003749 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003750 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003751 Py_DECREF(output);
3752 return 0;
3753 }
3754 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003755 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003756}
3757
3758
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003759int
3760PyUnicode_FSDecoder(PyObject* arg, void* addr)
3761{
Brett Cannona5711202016-09-06 19:36:01 -07003762 int is_buffer = 0;
3763 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003764 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003765 if (arg == NULL) {
3766 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003767 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003768 return 1;
3769 }
Brett Cannona5711202016-09-06 19:36:01 -07003770
3771 is_buffer = PyObject_CheckBuffer(arg);
3772 if (!is_buffer) {
3773 path = PyOS_FSPath(arg);
3774 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003775 return 0;
3776 }
Brett Cannona5711202016-09-06 19:36:01 -07003777 }
3778 else {
3779 path = arg;
3780 Py_INCREF(arg);
3781 }
3782
3783 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003784 output = path;
3785 }
3786 else if (PyBytes_Check(path) || is_buffer) {
3787 PyObject *path_bytes = NULL;
3788
3789 if (!PyBytes_Check(path) &&
3790 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003791 "path should be string, bytes, or os.PathLike, not %.200s",
3792 Py_TYPE(arg)->tp_name)) {
3793 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003794 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003795 }
3796 path_bytes = PyBytes_FromObject(path);
3797 Py_DECREF(path);
3798 if (!path_bytes) {
3799 return 0;
3800 }
3801 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3802 PyBytes_GET_SIZE(path_bytes));
3803 Py_DECREF(path_bytes);
3804 if (!output) {
3805 return 0;
3806 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003807 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003808 else {
3809 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003810 "path should be string, bytes, or os.PathLike, not %.200s",
3811 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003812 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003813 return 0;
3814 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003815 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003816 Py_DECREF(output);
3817 return 0;
3818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003819 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003820 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003821 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003822 Py_DECREF(output);
3823 return 0;
3824 }
3825 *(PyObject**)addr = output;
3826 return Py_CLEANUP_SUPPORTED;
3827}
3828
3829
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003830const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003831PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003832{
Christian Heimesf3863112007-11-22 07:46:41 +00003833 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003834
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003835 if (!PyUnicode_Check(unicode)) {
3836 PyErr_BadArgument();
3837 return NULL;
3838 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003839 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003840 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003841
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003842 if (PyUnicode_UTF8(unicode) == NULL) {
3843 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003844 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845 if (bytes == NULL)
3846 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003847 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3848 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003849 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003850 Py_DECREF(bytes);
3851 return NULL;
3852 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003853 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003854 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003855 PyBytes_AS_STRING(bytes),
3856 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857 Py_DECREF(bytes);
3858 }
3859
3860 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003861 *psize = PyUnicode_UTF8_LENGTH(unicode);
3862 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003863}
3864
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003865const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003867{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003868 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3869}
3870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871Py_UNICODE *
3872PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3873{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874 if (!PyUnicode_Check(unicode)) {
3875 PyErr_BadArgument();
3876 return NULL;
3877 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003878 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3879 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003880 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003881 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003882 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003883
Serhiy Storchakac46db922018-10-23 22:58:24 +03003884 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3885 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3886 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003887 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003888 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003889 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3890 if (w == NULL) {
3891 PyErr_NoMemory();
3892 return NULL;
3893 }
3894 unicode_copy_as_widechar(unicode, w, wlen + 1);
3895 _PyUnicode_WSTR(unicode) = w;
3896 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3897 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003898 }
3899 }
3900 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003901 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003902 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00003903}
3904
Alexander Belopolsky40018472011-02-26 01:02:56 +00003905Py_UNICODE *
3906PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909}
3910
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03003911const Py_UNICODE *
3912_PyUnicode_AsUnicode(PyObject *unicode)
3913{
3914 Py_ssize_t size;
3915 const Py_UNICODE *wstr;
3916
3917 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3918 if (wstr && wcslen(wstr) != (size_t)size) {
3919 PyErr_SetString(PyExc_ValueError, "embedded null character");
3920 return NULL;
3921 }
3922 return wstr;
3923}
3924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003925
Alexander Belopolsky40018472011-02-26 01:02:56 +00003926Py_ssize_t
3927PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928{
3929 if (!PyUnicode_Check(unicode)) {
3930 PyErr_BadArgument();
3931 goto onError;
3932 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003933 if (_PyUnicode_WSTR(unicode) == NULL) {
3934 if (PyUnicode_AsUnicode(unicode) == NULL)
3935 goto onError;
3936 }
3937 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938
Benjamin Peterson29060642009-01-31 22:14:21 +00003939 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940 return -1;
3941}
3942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943Py_ssize_t
3944PyUnicode_GetLength(PyObject *unicode)
3945{
Victor Stinner07621332012-06-16 04:53:46 +02003946 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947 PyErr_BadArgument();
3948 return -1;
3949 }
Victor Stinner07621332012-06-16 04:53:46 +02003950 if (PyUnicode_READY(unicode) == -1)
3951 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952 return PyUnicode_GET_LENGTH(unicode);
3953}
3954
3955Py_UCS4
3956PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3957{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003958 void *data;
3959 int kind;
3960
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003961 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003962 PyErr_BadArgument();
3963 return (Py_UCS4)-1;
3964 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003965 if (PyUnicode_READY(unicode) == -1) {
3966 return (Py_UCS4)-1;
3967 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003968 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003969 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003970 return (Py_UCS4)-1;
3971 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003972 data = PyUnicode_DATA(unicode);
3973 kind = PyUnicode_KIND(unicode);
3974 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003975}
3976
3977int
3978PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3979{
3980 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003981 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003982 return -1;
3983 }
Victor Stinner488fa492011-12-12 00:01:39 +01003984 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003985 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003986 PyErr_SetString(PyExc_IndexError, "string index out of range");
3987 return -1;
3988 }
Victor Stinner488fa492011-12-12 00:01:39 +01003989 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003990 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003991 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3992 PyErr_SetString(PyExc_ValueError, "character out of range");
3993 return -1;
3994 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3996 index, ch);
3997 return 0;
3998}
3999
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4005
Victor Stinner554f3f02010-06-16 23:33:54 +00004006/* create or adjust a UnicodeDecodeError */
4007static void
4008make_decode_exception(PyObject **exceptionObject,
4009 const char *encoding,
4010 const char *input, Py_ssize_t length,
4011 Py_ssize_t startpos, Py_ssize_t endpos,
4012 const char *reason)
4013{
4014 if (*exceptionObject == NULL) {
4015 *exceptionObject = PyUnicodeDecodeError_Create(
4016 encoding, input, length, startpos, endpos, reason);
4017 }
4018 else {
4019 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4020 goto onError;
4021 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4022 goto onError;
4023 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4024 goto onError;
4025 }
4026 return;
4027
4028onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004029 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004030}
4031
Steve Dowercc16be82016-09-08 10:35:16 -07004032#ifdef MS_WINDOWS
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004033/* error handling callback helper:
4034 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004035 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004036 and adjust various state variables.
4037 return 0 on success, -1 on error
4038*/
4039
Alexander Belopolsky40018472011-02-26 01:02:56 +00004040static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004041unicode_decode_call_errorhandler_wchar(
4042 const char *errors, PyObject **errorHandler,
4043 const char *encoding, const char *reason,
4044 const char **input, const char **inend, Py_ssize_t *startinpos,
4045 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4046 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004048 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004049
4050 PyObject *restuple = NULL;
4051 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004052 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004053 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004054 Py_ssize_t requiredsize;
4055 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004056 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004057 wchar_t *repwstr;
4058 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004060 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4061 outsize = _PyUnicode_WSTR_LENGTH(*output);
Victor Stinner596a6c42011-11-09 00:02:18 +01004062
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004063 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004064 *errorHandler = PyCodec_LookupError(errors);
4065 if (*errorHandler == NULL)
4066 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067 }
4068
Victor Stinner554f3f02010-06-16 23:33:54 +00004069 make_decode_exception(exceptionObject,
4070 encoding,
4071 *input, *inend - *input,
4072 *startinpos, *endinpos,
4073 reason);
4074 if (*exceptionObject == NULL)
4075 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004076
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004077 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004078 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004079 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004081 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004082 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004084 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004085 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004086
4087 /* Copy back the bytes variables, which might have been modified by the
4088 callback */
4089 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4090 if (!inputobj)
4091 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004092 *input = PyBytes_AS_STRING(inputobj);
4093 insize = PyBytes_GET_SIZE(inputobj);
4094 *inend = *input + insize;
4095 /* we can DECREF safely, as the exception has another reference,
4096 so the object won't go away. */
4097 Py_DECREF(inputobj);
4098
4099 if (newpos<0)
4100 newpos = insize+newpos;
4101 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004102 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004103 goto onError;
4104 }
4105
4106 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4107 if (repwstr == NULL)
4108 goto onError;
4109 /* need more space? (at least enough for what we
4110 have+the replacement+the rest of the string (starting
4111 at the new input position), so we won't have to check space
4112 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004113 requiredsize = *outpos;
4114 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4115 goto overflow;
4116 requiredsize += repwlen;
4117 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4118 goto overflow;
4119 requiredsize += insize - newpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004120 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004121 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004122 requiredsize = 2*outsize;
4123 if (unicode_resize(output, requiredsize) < 0)
4124 goto onError;
4125 }
4126 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4127 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004128 *endinpos = newpos;
4129 *inptr = *input + newpos;
4130
4131 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004132 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004133 return 0;
4134
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004135 overflow:
4136 PyErr_SetString(PyExc_OverflowError,
4137 "decoded result is too long for a Python string");
4138
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004139 onError:
4140 Py_XDECREF(restuple);
4141 return -1;
4142}
Steve Dowercc16be82016-09-08 10:35:16 -07004143#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004144
4145static int
4146unicode_decode_call_errorhandler_writer(
4147 const char *errors, PyObject **errorHandler,
4148 const char *encoding, const char *reason,
4149 const char **input, const char **inend, Py_ssize_t *startinpos,
4150 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4151 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4152{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004153 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004154
4155 PyObject *restuple = NULL;
4156 PyObject *repunicode = NULL;
4157 Py_ssize_t insize;
4158 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004159 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004160 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004161 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004162 int need_to_grow = 0;
4163 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004164
4165 if (*errorHandler == NULL) {
4166 *errorHandler = PyCodec_LookupError(errors);
4167 if (*errorHandler == NULL)
4168 goto onError;
4169 }
4170
4171 make_decode_exception(exceptionObject,
4172 encoding,
4173 *input, *inend - *input,
4174 *startinpos, *endinpos,
4175 reason);
4176 if (*exceptionObject == NULL)
4177 goto onError;
4178
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004179 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004180 if (restuple == NULL)
4181 goto onError;
4182 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004183 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004184 goto onError;
4185 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004186 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004187 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004188
4189 /* Copy back the bytes variables, which might have been modified by the
4190 callback */
4191 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4192 if (!inputobj)
4193 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004194 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004195 *input = PyBytes_AS_STRING(inputobj);
4196 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004197 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004198 /* we can DECREF safely, as the exception has another reference,
4199 so the object won't go away. */
4200 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004201
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004203 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004204 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004205 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004206 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004207 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208
Victor Stinner170ca6f2013-04-18 00:25:28 +02004209 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004210 if (replen > 1) {
4211 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004212 need_to_grow = 1;
4213 }
4214 new_inptr = *input + newpos;
4215 if (*inend - new_inptr > remain) {
4216 /* We don't know the decoding algorithm here so we make the worst
4217 assumption that one byte decodes to one unicode character.
4218 If unfortunately one byte could decode to more unicode characters,
4219 the decoder may write out-of-bound then. Is it possible for the
4220 algorithms using this function? */
4221 writer->min_length += *inend - new_inptr - remain;
4222 need_to_grow = 1;
4223 }
4224 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004225 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004226 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004227 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4228 goto onError;
4229 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004230 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004231 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004232
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004234 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004235
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004237 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004238 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004242 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243}
4244
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004245/* --- UTF-7 Codec -------------------------------------------------------- */
4246
Antoine Pitrou244651a2009-05-04 18:56:13 +00004247/* See RFC2152 for details. We encode conservatively and decode liberally. */
4248
/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  Every macro may evaluate its argument more than once, so
   pass only side-effect-free expressions. */

/* Non-zero iff c is one of the 64 base-64 alphabet characters:
   'A'-'Z', 'a'-'z', '0'-'9', '+' or '/'. */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* Map a base-64 character to its 6-bit value (A-Z: 0-25, a-z: 26-51,
   0-9: 52-61, '+': 62).  Any other input, including '/', yields 63, so
   the caller is expected to have checked IS_BASE64(c) first. */

#define FROM_BASE64(c) \
    (((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: non-zero iff a byte found outside a shift sequence
 * decodes as itself.  Decoding is permissive: every ASCII byte is taken
 * literally except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4279
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only, hence
 * const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4313
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - code point to classify; only 1..127 can ever be direct
 *            (the range test also guards the utf7_category lookup).
 * directO  - non-zero to emit "Set O" characters (category 1) directly.
 * directWS - non-zero to emit whitespace (category 2) directly.
 *
 * "Set D" characters (category 0) are always direct.  All three
 * arguments are parenthesized in the expansion so that compound flag
 * expressions (e.g. `a || b`) keep their meaning; c may be evaluated
 * several times and must be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004325
Alexander Belopolsky40018472011-02-26 01:02:56 +00004326PyObject *
4327PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004328 Py_ssize_t size,
4329 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004330{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004331 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4332}
4333
Antoine Pitrou244651a2009-05-04 18:56:13 +00004334/* The decoder. The only state we preserve is our read position,
4335 * i.e. how many characters we have consumed. So if we end in the
4336 * middle of a shift sequence we have to back off the read position
4337 * and the output to the beginning of the sequence, otherwise we lose
4338 * all the shift state (seen bits, number of bits seen, high
4339 * surrogate). */
4340
Alexander Belopolsky40018472011-02-26 01:02:56 +00004341PyObject *
4342PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004343 Py_ssize_t size,
4344 const char *errors,
4345 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004346{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004347 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004348 Py_ssize_t startinpos;
4349 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004350 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004351 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004352 const char *errmsg = "";
4353 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004354 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004355 unsigned int base64bits = 0;
4356 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004357 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004358 PyObject *errorHandler = NULL;
4359 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004360
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004361 if (size == 0) {
4362 if (consumed)
4363 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004364 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004365 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004366
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004367 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004368 _PyUnicodeWriter_Init(&writer);
4369 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004370
4371 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004372 e = s + size;
4373
4374 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004375 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004376 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004377 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004378
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 if (inShift) { /* in a base-64 section */
4380 if (IS_BASE64(ch)) { /* consume a base-64 character */
4381 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4382 base64bits += 6;
4383 s++;
4384 if (base64bits >= 16) {
4385 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004386 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004387 base64bits -= 16;
4388 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004389 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004390 if (surrogate) {
4391 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004392 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4393 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004394 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004395 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004396 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004397 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004398 }
4399 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004400 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004401 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004402 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403 }
4404 }
Victor Stinner551ac952011-11-29 22:58:13 +01004405 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004406 /* first surrogate */
4407 surrogate = outCh;
4408 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004409 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004410 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004411 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004412 }
4413 }
4414 }
4415 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004417 if (base64bits > 0) { /* left-over bits */
4418 if (base64bits >= 6) {
4419 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004420 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 errmsg = "partial character in shift sequence";
4422 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004423 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004424 else {
4425 /* Some bits remain; they should be zero */
4426 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004427 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428 errmsg = "non-zero padding bits in shift sequence";
4429 goto utf7Error;
4430 }
4431 }
4432 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004433 if (surrogate && DECODE_DIRECT(ch)) {
4434 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4435 goto onError;
4436 }
4437 surrogate = 0;
4438 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004439 /* '-' is absorbed; other terminating
4440 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004441 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004442 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004443 }
4444 }
4445 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 s++; /* consume '+' */
4448 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004449 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004450 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004451 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004453 else if (s < e && !IS_BASE64(*s)) {
4454 s++;
4455 errmsg = "ill-formed sequence";
4456 goto utf7Error;
4457 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004458 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004459 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004460 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004461 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004463 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004464 }
4465 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004466 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004467 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004468 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004469 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004470 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004471 else {
4472 startinpos = s-starts;
4473 s++;
4474 errmsg = "unexpected special character";
4475 goto utf7Error;
4476 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004477 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004478utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004479 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004480 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 errors, &errorHandler,
4482 "utf7", errmsg,
4483 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004484 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004485 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004486 }
4487
Antoine Pitrou244651a2009-05-04 18:56:13 +00004488 /* end of string */
4489
4490 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4491 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004492 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004493 if (surrogate ||
4494 (base64bits >= 6) ||
4495 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004496 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004497 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004498 errors, &errorHandler,
4499 "utf7", "unterminated shift sequence",
4500 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004501 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502 goto onError;
4503 if (s < e)
4504 goto restart;
4505 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004506 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004507
4508 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004509 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004510 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004511 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004512 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004513 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004514 writer.kind, writer.data, shiftOutStart);
4515 Py_XDECREF(errorHandler);
4516 Py_XDECREF(exc);
4517 _PyUnicodeWriter_Dealloc(&writer);
4518 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004519 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004520 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 }
4522 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004523 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004524 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004525 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004526
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527 Py_XDECREF(errorHandler);
4528 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004529 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004530
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532 Py_XDECREF(errorHandler);
4533 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004534 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 return NULL;
4536}
4537
4538
Alexander Belopolsky40018472011-02-26 01:02:56 +00004539PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004540_PyUnicode_EncodeUTF7(PyObject *str,
4541 int base64SetO,
4542 int base64WhiteSpace,
4543 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004545 int kind;
4546 void *data;
4547 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004548 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004549 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004550 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 unsigned int base64bits = 0;
4552 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004553 char * out;
4554 char * start;
4555
Benjamin Petersonbac79492012-01-14 13:34:47 -05004556 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004557 return NULL;
4558 kind = PyUnicode_KIND(str);
4559 data = PyUnicode_DATA(str);
4560 len = PyUnicode_GET_LENGTH(str);
4561
4562 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004563 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004564
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004565 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004566 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004567 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004568 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004569 if (v == NULL)
4570 return NULL;
4571
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004572 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004573 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004574 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004575
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 if (inShift) {
4577 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4578 /* shifting out */
4579 if (base64bits) { /* output remaining bits */
4580 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4581 base64buffer = 0;
4582 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004583 }
4584 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 /* Characters not in the BASE64 set implicitly unshift the sequence
4586 so no '-' is required, except if the character is itself a '-' */
4587 if (IS_BASE64(ch) || ch == '-') {
4588 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004589 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 *out++ = (char) ch;
4591 }
4592 else {
4593 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004594 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004595 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004596 else { /* not in a shift sequence */
4597 if (ch == '+') {
4598 *out++ = '+';
4599 *out++ = '-';
4600 }
4601 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4602 *out++ = (char) ch;
4603 }
4604 else {
4605 *out++ = '+';
4606 inShift = 1;
4607 goto encode_char;
4608 }
4609 }
4610 continue;
4611encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004613 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004614
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 /* code first surrogate */
4616 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004617 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004618 while (base64bits >= 6) {
4619 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4620 base64bits -= 6;
4621 }
4622 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004623 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004624 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004625 base64bits += 16;
4626 base64buffer = (base64buffer << 16) | ch;
4627 while (base64bits >= 6) {
4628 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4629 base64bits -= 6;
4630 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004631 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004632 if (base64bits)
4633 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4634 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004635 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004636 if (_PyBytes_Resize(&v, out - start) < 0)
4637 return NULL;
4638 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004639}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004640PyObject *
4641PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4642 Py_ssize_t size,
4643 int base64SetO,
4644 int base64WhiteSpace,
4645 const char *errors)
4646{
4647 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004648 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004649 if (tmp == NULL)
4650 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004651 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004652 base64WhiteSpace, errors);
4653 Py_DECREF(tmp);
4654 return result;
4655}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004656
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657#undef IS_BASE64
4658#undef FROM_BASE64
4659#undef TO_BASE64
4660#undef DECODE_DIRECT
4661#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004662
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663/* --- UTF-8 Codec -------------------------------------------------------- */
4664
Alexander Belopolsky40018472011-02-26 01:02:56 +00004665PyObject *
4666PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004667 Py_ssize_t size,
4668 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004669{
Walter Dörwald69652032004-09-07 20:24:22 +00004670 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4671}
4672
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004673#include "stringlib/asciilib.h"
4674#include "stringlib/codecs.h"
4675#include "stringlib/undef.h"
4676
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004677#include "stringlib/ucs1lib.h"
4678#include "stringlib/codecs.h"
4679#include "stringlib/undef.h"
4680
4681#include "stringlib/ucs2lib.h"
4682#include "stringlib/codecs.h"
4683#include "stringlib/undef.h"
4684
4685#include "stringlib/ucs4lib.h"
4686#include "stringlib/codecs.h"
4687#include "stringlib/undef.h"
4688
Antoine Pitrouab868312009-01-10 15:40:25 +00004689/* Mask to quickly check whether a C 'long' contains a
4690 non-ASCII, UTF8-encoded char. */
4691#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004692# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004693#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004694# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004695#else
4696# error C 'long' size should be either 4 or 8!
4697#endif
4698
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004699static Py_ssize_t
4700ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004701{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004702 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004703 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004704
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004705 /*
4706 * Issue #17237: m68k is a bit different from most architectures in
4707 * that objects do not use "natural alignment" - for example, int and
4708 * long are only aligned at 2-byte boundaries. Therefore the assert()
4709 * won't work; also, tests have shown that skipping the "optimised
4710 * version" will even speed up m68k.
4711 */
4712#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004713#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004714 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4715 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004716 /* Fast path, see in STRINGLIB(utf8_decode) for
4717 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004718 /* Help allocation */
4719 const char *_p = p;
4720 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004721 while (_p < aligned_end) {
4722 unsigned long value = *(const unsigned long *) _p;
4723 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004724 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004725 *((unsigned long *)q) = value;
4726 _p += SIZEOF_LONG;
4727 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004728 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004729 p = _p;
4730 while (p < end) {
4731 if ((unsigned char)*p & 0x80)
4732 break;
4733 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004735 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004737#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004738#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004739 while (p < end) {
4740 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4741 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004742 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004743 /* Help allocation */
4744 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004745 while (_p < aligned_end) {
4746 unsigned long value = *(unsigned long *) _p;
4747 if (value & ASCII_CHAR_MASK)
4748 break;
4749 _p += SIZEOF_LONG;
4750 }
4751 p = _p;
4752 if (_p == end)
4753 break;
4754 }
4755 if ((unsigned char)*p & 0x80)
4756 break;
4757 ++p;
4758 }
4759 memcpy(dest, start, p - start);
4760 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761}
Antoine Pitrouab868312009-01-10 15:40:25 +00004762
Victor Stinner785938e2011-12-11 20:09:03 +01004763PyObject *
4764PyUnicode_DecodeUTF8Stateful(const char *s,
4765 Py_ssize_t size,
4766 const char *errors,
4767 Py_ssize_t *consumed)
4768{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004769 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004770 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004771 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004772
4773 Py_ssize_t startinpos;
4774 Py_ssize_t endinpos;
4775 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004776 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004777 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004778 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004779
4780 if (size == 0) {
4781 if (consumed)
4782 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004783 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004784 }
4785
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004786 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4787 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004788 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004789 *consumed = 1;
4790 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004791 }
4792
Victor Stinner8f674cc2013-04-17 23:02:17 +02004793 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004794 writer.min_length = size;
4795 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004796 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004797
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004798 writer.pos = ascii_decode(s, end, writer.data);
4799 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004800 while (s < end) {
4801 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004802 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004803
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004804 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004805 if (PyUnicode_IS_ASCII(writer.buffer))
4806 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004807 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004808 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004809 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004810 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004811 } else {
4812 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004813 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004814 }
4815
4816 switch (ch) {
4817 case 0:
4818 if (s == end || consumed)
4819 goto End;
4820 errmsg = "unexpected end of data";
4821 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004822 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004823 break;
4824 case 1:
4825 errmsg = "invalid start byte";
4826 startinpos = s - starts;
4827 endinpos = startinpos + 1;
4828 break;
4829 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004830 case 3:
4831 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004832 errmsg = "invalid continuation byte";
4833 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004834 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004835 break;
4836 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004837 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004838 goto onError;
4839 continue;
4840 }
4841
Victor Stinner1d65d912015-10-05 13:43:50 +02004842 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004843 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004844
4845 switch (error_handler) {
4846 case _Py_ERROR_IGNORE:
4847 s += (endinpos - startinpos);
4848 break;
4849
4850 case _Py_ERROR_REPLACE:
4851 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4852 goto onError;
4853 s += (endinpos - startinpos);
4854 break;
4855
4856 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004857 {
4858 Py_ssize_t i;
4859
Victor Stinner1d65d912015-10-05 13:43:50 +02004860 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4861 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004862 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004863 ch = (Py_UCS4)(unsigned char)(starts[i]);
4864 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4865 ch + 0xdc00);
4866 writer.pos++;
4867 }
4868 s += (endinpos - startinpos);
4869 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004870 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004871
4872 default:
4873 if (unicode_decode_call_errorhandler_writer(
4874 errors, &error_handler_obj,
4875 "utf-8", errmsg,
4876 &starts, &end, &startinpos, &endinpos, &exc, &s,
4877 &writer))
4878 goto onError;
4879 }
Victor Stinner785938e2011-12-11 20:09:03 +01004880 }
4881
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004883 if (consumed)
4884 *consumed = s - starts;
4885
Victor Stinner1d65d912015-10-05 13:43:50 +02004886 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004888 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004889
4890onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004891 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004892 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004893 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004894 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004895}
4896
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004897
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004898/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4899 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004900
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004901 On success, write a pointer to a newly allocated wide character string into
4902 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4903 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004904
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004905 On memory allocation failure, return -1.
4906
4907 On decoding error (if surrogateescape is zero), return -2. If wlen is
4908 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4909 is not NULL, write the decoding error message into *reason. */
4910int
4911_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02004912 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004913{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004914 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004915 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004916 wchar_t *unicode;
4917 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004918
Victor Stinner3d4226a2018-08-29 22:21:32 +02004919 int surrogateescape = 0;
4920 int surrogatepass = 0;
4921 switch (errors)
4922 {
4923 case _Py_ERROR_STRICT:
4924 break;
4925 case _Py_ERROR_SURROGATEESCAPE:
4926 surrogateescape = 1;
4927 break;
4928 case _Py_ERROR_SURROGATEPASS:
4929 surrogatepass = 1;
4930 break;
4931 default:
4932 return -3;
4933 }
4934
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004935 /* Note: size will always be longer than the resulting Unicode
4936 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01004937 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004938 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004939 }
4940
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004941 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01004942 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004943 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004944 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004945
4946 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004947 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004948 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004949 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004950 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004951#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004952 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004953#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004954 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004955#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004956 if (ch > 0xFF) {
4957#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07004958 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004959#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02004960 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004961 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004962 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4963 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4964#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004965 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004966 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02004967 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004968 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004969 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02004970
4971 if (surrogateescape) {
4972 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4973 }
4974 else {
4975 /* Is it a valid three-byte code? */
4976 if (surrogatepass
4977 && (e - s) >= 3
4978 && (s[0] & 0xf0) == 0xe0
4979 && (s[1] & 0xc0) == 0x80
4980 && (s[2] & 0xc0) == 0x80)
4981 {
4982 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4983 s += 3;
4984 unicode[outpos++] = ch;
4985 }
4986 else {
4987 PyMem_RawFree(unicode );
4988 if (reason != NULL) {
4989 switch (ch) {
4990 case 0:
4991 *reason = "unexpected end of data";
4992 break;
4993 case 1:
4994 *reason = "invalid start byte";
4995 break;
4996 /* 2, 3, 4 */
4997 default:
4998 *reason = "invalid continuation byte";
4999 break;
5000 }
5001 }
5002 if (wlen != NULL) {
5003 *wlen = s - orig_s;
5004 }
5005 return -2;
5006 }
5007 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005008 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005009 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005010 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005011 if (wlen) {
5012 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005013 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005014 *wstr = unicode;
5015 return 0;
5016}
5017
5018wchar_t*
5019_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5020{
5021 wchar_t *wstr;
5022 int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5023 if (res != 0) {
5024 return NULL;
5025 }
5026 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005027}
5028
Antoine Pitrouab868312009-01-10 15:40:25 +00005029
Victor Stinnere47e6982017-12-21 15:45:16 +01005030/* UTF-8 encoder using the surrogateescape error handler .
5031
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005032 On success, return 0 and write the newly allocated character string (use
5033 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005034
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005035 On encoding failure, return -2 and write the position of the invalid
5036 surrogate character into *error_pos (if error_pos is set) and the decoding
5037 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005038
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005039 On memory allocation failure, return -1. */
5040int
5041_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005042 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005043{
5044 const Py_ssize_t max_char_size = 4;
5045 Py_ssize_t len = wcslen(text);
5046
5047 assert(len >= 0);
5048
Victor Stinner3d4226a2018-08-29 22:21:32 +02005049 int surrogateescape = 0;
5050 int surrogatepass = 0;
5051 switch (errors)
5052 {
5053 case _Py_ERROR_STRICT:
5054 break;
5055 case _Py_ERROR_SURROGATEESCAPE:
5056 surrogateescape = 1;
5057 break;
5058 case _Py_ERROR_SURROGATEPASS:
5059 surrogatepass = 1;
5060 break;
5061 default:
5062 return -3;
5063 }
5064
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005065 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5066 return -1;
5067 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005068 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005069 if (raw_malloc) {
5070 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005071 }
5072 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005073 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005074 }
5075 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005076 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005077 }
5078
5079 char *p = bytes;
5080 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005081 for (i = 0; i < len; ) {
5082 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005083 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005084 i++;
5085#if Py_UNICODE_SIZE == 2
5086 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5087 && i < len
5088 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5089 {
5090 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5091 i++;
5092 }
5093#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005094
5095 if (ch < 0x80) {
5096 /* Encode ASCII */
5097 *p++ = (char) ch;
5098
5099 }
5100 else if (ch < 0x0800) {
5101 /* Encode Latin-1 */
5102 *p++ = (char)(0xc0 | (ch >> 6));
5103 *p++ = (char)(0x80 | (ch & 0x3f));
5104 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005105 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005106 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005107 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005108 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005109 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005110 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005111 if (reason != NULL) {
5112 *reason = "encoding error";
5113 }
5114 if (raw_malloc) {
5115 PyMem_RawFree(bytes);
5116 }
5117 else {
5118 PyMem_Free(bytes);
5119 }
5120 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005121 }
5122 *p++ = (char)(ch & 0xff);
5123 }
5124 else if (ch < 0x10000) {
5125 *p++ = (char)(0xe0 | (ch >> 12));
5126 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5127 *p++ = (char)(0x80 | (ch & 0x3f));
5128 }
5129 else { /* ch >= 0x10000 */
5130 assert(ch <= MAX_UNICODE);
5131 /* Encode UCS4 Unicode ordinals */
5132 *p++ = (char)(0xf0 | (ch >> 18));
5133 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5134 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5135 *p++ = (char)(0x80 | (ch & 0x3f));
5136 }
5137 }
5138 *p++ = '\0';
5139
5140 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005141 char *bytes2;
5142 if (raw_malloc) {
5143 bytes2 = PyMem_RawRealloc(bytes, final_size);
5144 }
5145 else {
5146 bytes2 = PyMem_Realloc(bytes, final_size);
5147 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005148 if (bytes2 == NULL) {
5149 if (error_pos != NULL) {
5150 *error_pos = (size_t)-1;
5151 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005152 if (raw_malloc) {
5153 PyMem_RawFree(bytes);
5154 }
5155 else {
5156 PyMem_Free(bytes);
5157 }
5158 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005159 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005160 *str = bytes2;
5161 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005162}
5163
5164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005165/* Primary internal function which creates utf8 encoded bytes objects.
5166
5167 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005168 and allocate exactly as much space needed at the end. Else allocate the
5169 maximum possible needed (4 result bytes per Unicode character), and return
5170 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005171*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005172PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005173_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174{
Victor Stinner6099a032011-12-18 14:22:26 +01005175 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005176 void *data;
5177 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005179 if (!PyUnicode_Check(unicode)) {
5180 PyErr_BadArgument();
5181 return NULL;
5182 }
5183
5184 if (PyUnicode_READY(unicode) == -1)
5185 return NULL;
5186
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005187 if (PyUnicode_UTF8(unicode))
5188 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5189 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005190
5191 kind = PyUnicode_KIND(unicode);
5192 data = PyUnicode_DATA(unicode);
5193 size = PyUnicode_GET_LENGTH(unicode);
5194
Benjamin Petersonead6b532011-12-20 17:23:42 -06005195 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005196 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005197 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005198 case PyUnicode_1BYTE_KIND:
5199 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5200 assert(!PyUnicode_IS_ASCII(unicode));
5201 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5202 case PyUnicode_2BYTE_KIND:
5203 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5204 case PyUnicode_4BYTE_KIND:
5205 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207}
5208
Alexander Belopolsky40018472011-02-26 01:02:56 +00005209PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005210PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5211 Py_ssize_t size,
5212 const char *errors)
5213{
5214 PyObject *v, *unicode;
5215
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005216 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005217 if (unicode == NULL)
5218 return NULL;
5219 v = _PyUnicode_AsUTF8String(unicode, errors);
5220 Py_DECREF(unicode);
5221 return v;
5222}
5223
5224PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005225PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005227 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228}
5229
Walter Dörwald41980ca2007-08-16 21:55:45 +00005230/* --- UTF-32 Codec ------------------------------------------------------- */
5231
5232PyObject *
5233PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005234 Py_ssize_t size,
5235 const char *errors,
5236 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005237{
5238 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5239}
5240
5241PyObject *
5242PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 Py_ssize_t size,
5244 const char *errors,
5245 int *byteorder,
5246 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005247{
5248 const char *starts = s;
5249 Py_ssize_t startinpos;
5250 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005251 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005252 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005253 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005254 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005255 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005256 PyObject *errorHandler = NULL;
5257 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005258
Walter Dörwald41980ca2007-08-16 21:55:45 +00005259 q = (unsigned char *)s;
5260 e = q + size;
5261
5262 if (byteorder)
5263 bo = *byteorder;
5264
5265 /* Check for BOM marks (U+FEFF) in the input and adjust current
5266 byte order setting accordingly. In native mode, the leading BOM
5267 mark is skipped, in all other modes, it is copied to the output
5268 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005269 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005270 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005271 if (bom == 0x0000FEFF) {
5272 bo = -1;
5273 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005274 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005275 else if (bom == 0xFFFE0000) {
5276 bo = 1;
5277 q += 4;
5278 }
5279 if (byteorder)
5280 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005281 }
5282
Victor Stinnere64322e2012-10-30 23:12:47 +01005283 if (q == e) {
5284 if (consumed)
5285 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005286 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005287 }
5288
Victor Stinnere64322e2012-10-30 23:12:47 +01005289#ifdef WORDS_BIGENDIAN
5290 le = bo < 0;
5291#else
5292 le = bo <= 0;
5293#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005294 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005295
Victor Stinner8f674cc2013-04-17 23:02:17 +02005296 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005297 writer.min_length = (e - q + 3) / 4;
5298 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005299 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005300
Victor Stinnere64322e2012-10-30 23:12:47 +01005301 while (1) {
5302 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005303 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005304
Victor Stinnere64322e2012-10-30 23:12:47 +01005305 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005306 enum PyUnicode_Kind kind = writer.kind;
5307 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005308 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005309 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005310 if (le) {
5311 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005312 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005313 if (ch > maxch)
5314 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005315 if (kind != PyUnicode_1BYTE_KIND &&
5316 Py_UNICODE_IS_SURROGATE(ch))
5317 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005318 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005319 q += 4;
5320 } while (q <= last);
5321 }
5322 else {
5323 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005324 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005325 if (ch > maxch)
5326 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005327 if (kind != PyUnicode_1BYTE_KIND &&
5328 Py_UNICODE_IS_SURROGATE(ch))
5329 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005330 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005331 q += 4;
5332 } while (q <= last);
5333 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005334 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005335 }
5336
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005337 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005338 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005339 startinpos = ((const char *)q) - starts;
5340 endinpos = startinpos + 4;
5341 }
5342 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005343 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005345 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005347 startinpos = ((const char *)q) - starts;
5348 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005350 else {
5351 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005352 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005353 goto onError;
5354 q += 4;
5355 continue;
5356 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005357 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005358 startinpos = ((const char *)q) - starts;
5359 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005361
5362 /* The remaining input chars are ignored if the callback
5363 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005364 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005366 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005368 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005370 }
5371
Walter Dörwald41980ca2007-08-16 21:55:45 +00005372 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005374
Walter Dörwald41980ca2007-08-16 21:55:45 +00005375 Py_XDECREF(errorHandler);
5376 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005377 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005378
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005380 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005381 Py_XDECREF(errorHandler);
5382 Py_XDECREF(exc);
5383 return NULL;
5384}
5385
5386PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005387_PyUnicode_EncodeUTF32(PyObject *str,
5388 const char *errors,
5389 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005390{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005391 enum PyUnicode_Kind kind;
5392 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005393 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005394 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005395 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005396#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005397 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005398#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005399 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005400#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005401 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005402 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005403 PyObject *errorHandler = NULL;
5404 PyObject *exc = NULL;
5405 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005406
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005407 if (!PyUnicode_Check(str)) {
5408 PyErr_BadArgument();
5409 return NULL;
5410 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005411 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005412 return NULL;
5413 kind = PyUnicode_KIND(str);
5414 data = PyUnicode_DATA(str);
5415 len = PyUnicode_GET_LENGTH(str);
5416
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005417 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005418 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005419 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005420 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005421 if (v == NULL)
5422 return NULL;
5423
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005424 /* output buffer is 4-bytes aligned */
5425 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005426 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005427 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005428 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005429 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005430 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005431
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005432 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005433 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005434 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005435 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005436 else
5437 encoding = "utf-32";
5438
5439 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005440 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5441 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005442 }
5443
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005444 pos = 0;
5445 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005446 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005447
5448 if (kind == PyUnicode_2BYTE_KIND) {
5449 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5450 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005451 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005452 else {
5453 assert(kind == PyUnicode_4BYTE_KIND);
5454 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5455 &out, native_ordering);
5456 }
5457 if (pos == len)
5458 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005459
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005460 rep = unicode_encode_call_errorhandler(
5461 errors, &errorHandler,
5462 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005463 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005464 if (!rep)
5465 goto error;
5466
5467 if (PyBytes_Check(rep)) {
5468 repsize = PyBytes_GET_SIZE(rep);
5469 if (repsize & 3) {
5470 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005471 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005472 "surrogates not allowed");
5473 goto error;
5474 }
5475 moreunits = repsize / 4;
5476 }
5477 else {
5478 assert(PyUnicode_Check(rep));
5479 if (PyUnicode_READY(rep) < 0)
5480 goto error;
5481 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5482 if (!PyUnicode_IS_ASCII(rep)) {
5483 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005484 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 "surrogates not allowed");
5486 goto error;
5487 }
5488 }
5489
5490 /* four bytes are reserved for each surrogate */
5491 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005492 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005493 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005494 /* integer overflow */
5495 PyErr_NoMemory();
5496 goto error;
5497 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005498 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005499 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005500 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005501 }
5502
5503 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005504 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005505 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005506 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005507 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005508 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5509 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005510 }
5511
5512 Py_CLEAR(rep);
5513 }
5514
5515 /* Cut back to size actually needed. This is necessary for, for example,
5516 encoding of a string containing isolated surrogates and the 'ignore'
5517 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005518 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005519 if (nsize != PyBytes_GET_SIZE(v))
5520 _PyBytes_Resize(&v, nsize);
5521 Py_XDECREF(errorHandler);
5522 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005523 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005524 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005525 error:
5526 Py_XDECREF(rep);
5527 Py_XDECREF(errorHandler);
5528 Py_XDECREF(exc);
5529 Py_XDECREF(v);
5530 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005531}
5532
Alexander Belopolsky40018472011-02-26 01:02:56 +00005533PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005534PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5535 Py_ssize_t size,
5536 const char *errors,
5537 int byteorder)
5538{
5539 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005540 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005541 if (tmp == NULL)
5542 return NULL;
5543 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5544 Py_DECREF(tmp);
5545 return result;
5546}
5547
5548PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005549PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005550{
Victor Stinnerb960b342011-11-20 19:12:52 +01005551 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005552}
5553
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554/* --- UTF-16 Codec ------------------------------------------------------- */
5555
Tim Peters772747b2001-08-09 22:21:55 +00005556PyObject *
5557PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 Py_ssize_t size,
5559 const char *errors,
5560 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561{
Walter Dörwald69652032004-09-07 20:24:22 +00005562 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5563}
5564
5565PyObject *
5566PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005567 Py_ssize_t size,
5568 const char *errors,
5569 int *byteorder,
5570 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005571{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005573 Py_ssize_t startinpos;
5574 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005575 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005576 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005577 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005578 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005579 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580 PyObject *errorHandler = NULL;
5581 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005582 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583
Tim Peters772747b2001-08-09 22:21:55 +00005584 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005585 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586
5587 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005588 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005590 /* Check for BOM marks (U+FEFF) in the input and adjust current
5591 byte order setting accordingly. In native mode, the leading BOM
5592 mark is skipped, in all other modes, it is copied to the output
5593 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005594 if (bo == 0 && size >= 2) {
5595 const Py_UCS4 bom = (q[1] << 8) | q[0];
5596 if (bom == 0xFEFF) {
5597 q += 2;
5598 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005600 else if (bom == 0xFFFE) {
5601 q += 2;
5602 bo = 1;
5603 }
5604 if (byteorder)
5605 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005606 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607
Antoine Pitrou63065d72012-05-15 23:48:04 +02005608 if (q == e) {
5609 if (consumed)
5610 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005611 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005612 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005613
Christian Heimes743e0cd2012-10-17 23:52:17 +02005614#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005615 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005616 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005617#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005618 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005619 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005620#endif
Tim Peters772747b2001-08-09 22:21:55 +00005621
Antoine Pitrou63065d72012-05-15 23:48:04 +02005622 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005623 character count normally. Error handler will take care of
5624 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005625 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005626 writer.min_length = (e - q + 1) / 2;
5627 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005628 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005629
Antoine Pitrou63065d72012-05-15 23:48:04 +02005630 while (1) {
5631 Py_UCS4 ch = 0;
5632 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005633 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005634 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005635 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005636 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005637 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005638 native_ordering);
5639 else
5640 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005641 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005642 native_ordering);
5643 } else if (kind == PyUnicode_2BYTE_KIND) {
5644 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005645 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005646 native_ordering);
5647 } else {
5648 assert(kind == PyUnicode_4BYTE_KIND);
5649 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005650 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005651 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005652 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005653 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005654
Antoine Pitrou63065d72012-05-15 23:48:04 +02005655 switch (ch)
5656 {
5657 case 0:
5658 /* remaining byte at the end? (size should be even) */
5659 if (q == e || consumed)
5660 goto End;
5661 errmsg = "truncated data";
5662 startinpos = ((const char *)q) - starts;
5663 endinpos = ((const char *)e) - starts;
5664 break;
5665 /* The remaining input chars are ignored if the callback
5666 chooses to skip the input */
5667 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005668 q -= 2;
5669 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005670 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005671 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005672 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005673 endinpos = ((const char *)e) - starts;
5674 break;
5675 case 2:
5676 errmsg = "illegal encoding";
5677 startinpos = ((const char *)q) - 2 - starts;
5678 endinpos = startinpos + 2;
5679 break;
5680 case 3:
5681 errmsg = "illegal UTF-16 surrogate";
5682 startinpos = ((const char *)q) - 4 - starts;
5683 endinpos = startinpos + 2;
5684 break;
5685 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005686 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005687 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 continue;
5689 }
5690
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005691 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005692 errors,
5693 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005694 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005695 &starts,
5696 (const char **)&e,
5697 &startinpos,
5698 &endinpos,
5699 &exc,
5700 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005701 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 }
5704
Antoine Pitrou63065d72012-05-15 23:48:04 +02005705End:
Walter Dörwald69652032004-09-07 20:24:22 +00005706 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005708
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709 Py_XDECREF(errorHandler);
5710 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005711 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005714 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715 Py_XDECREF(errorHandler);
5716 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717 return NULL;
5718}
5719
Tim Peters772747b2001-08-09 22:21:55 +00005720PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005721_PyUnicode_EncodeUTF16(PyObject *str,
5722 const char *errors,
5723 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005725 enum PyUnicode_Kind kind;
5726 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005727 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005728 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005729 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005730 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005731#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005732 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005733#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005734 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005735#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005736 const char *encoding;
5737 Py_ssize_t nsize, pos;
5738 PyObject *errorHandler = NULL;
5739 PyObject *exc = NULL;
5740 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005741
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005742 if (!PyUnicode_Check(str)) {
5743 PyErr_BadArgument();
5744 return NULL;
5745 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005746 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005747 return NULL;
5748 kind = PyUnicode_KIND(str);
5749 data = PyUnicode_DATA(str);
5750 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005751
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005752 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005753 if (kind == PyUnicode_4BYTE_KIND) {
5754 const Py_UCS4 *in = (const Py_UCS4 *)data;
5755 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005756 while (in < end) {
5757 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005758 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005759 }
5760 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005761 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005762 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005763 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005764 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005765 nsize = len + pairs + (byteorder == 0);
5766 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005767 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005769 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005771 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005772 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005773 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005774 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005775 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005776 }
5777 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005778 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005779 }
Tim Peters772747b2001-08-09 22:21:55 +00005780
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005781 if (kind == PyUnicode_1BYTE_KIND) {
5782 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5783 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005784 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005785
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005786 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005787 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005788 }
5789 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005790 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005791 }
5792 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005793 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005794 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005795
5796 pos = 0;
5797 while (pos < len) {
5798 Py_ssize_t repsize, moreunits;
5799
5800 if (kind == PyUnicode_2BYTE_KIND) {
5801 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5802 &out, native_ordering);
5803 }
5804 else {
5805 assert(kind == PyUnicode_4BYTE_KIND);
5806 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5807 &out, native_ordering);
5808 }
5809 if (pos == len)
5810 break;
5811
5812 rep = unicode_encode_call_errorhandler(
5813 errors, &errorHandler,
5814 encoding, "surrogates not allowed",
5815 str, &exc, pos, pos + 1, &pos);
5816 if (!rep)
5817 goto error;
5818
5819 if (PyBytes_Check(rep)) {
5820 repsize = PyBytes_GET_SIZE(rep);
5821 if (repsize & 1) {
5822 raise_encode_exception(&exc, encoding,
5823 str, pos - 1, pos,
5824 "surrogates not allowed");
5825 goto error;
5826 }
5827 moreunits = repsize / 2;
5828 }
5829 else {
5830 assert(PyUnicode_Check(rep));
5831 if (PyUnicode_READY(rep) < 0)
5832 goto error;
5833 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5834 if (!PyUnicode_IS_ASCII(rep)) {
5835 raise_encode_exception(&exc, encoding,
5836 str, pos - 1, pos,
5837 "surrogates not allowed");
5838 goto error;
5839 }
5840 }
5841
5842 /* two bytes are reserved for each surrogate */
5843 if (moreunits > 1) {
5844 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005845 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005846 /* integer overflow */
5847 PyErr_NoMemory();
5848 goto error;
5849 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005850 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005851 goto error;
5852 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5853 }
5854
5855 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005856 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005857 out += moreunits;
5858 } else /* rep is unicode */ {
5859 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5860 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5861 &out, native_ordering);
5862 }
5863
5864 Py_CLEAR(rep);
5865 }
5866
5867 /* Cut back to size actually needed. This is necessary for, for example,
5868 encoding of a string containing isolated surrogates and the 'ignore' handler
5869 is used. */
5870 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5871 if (nsize != PyBytes_GET_SIZE(v))
5872 _PyBytes_Resize(&v, nsize);
5873 Py_XDECREF(errorHandler);
5874 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005875 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005876 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005877 error:
5878 Py_XDECREF(rep);
5879 Py_XDECREF(errorHandler);
5880 Py_XDECREF(exc);
5881 Py_XDECREF(v);
5882 return NULL;
5883#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884}
5885
Alexander Belopolsky40018472011-02-26 01:02:56 +00005886PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005887PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5888 Py_ssize_t size,
5889 const char *errors,
5890 int byteorder)
5891{
5892 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005893 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005894 if (tmp == NULL)
5895 return NULL;
5896 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5897 Py_DECREF(tmp);
5898 return result;
5899}
5900
5901PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005902PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005904 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905}
5906
5907/* --- Unicode Escape Codec ----------------------------------------------- */
5908
Fredrik Lundh06d12682001-01-24 07:59:11 +00005909static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005910
Alexander Belopolsky40018472011-02-26 01:02:56 +00005911PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005912_PyUnicode_DecodeUnicodeEscape(const char *s,
5913 Py_ssize_t size,
5914 const char *errors,
5915 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005917 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005918 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005920 PyObject *errorHandler = NULL;
5921 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005922
Eric V. Smith42454af2016-10-31 09:22:08 -04005923 // so we can remember if we've seen an invalid escape char or not
5924 *first_invalid_escape = NULL;
5925
Victor Stinner62ec3312016-09-06 17:04:34 -07005926 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005927 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005928 }
5929 /* Escaped strings will always be longer than the resulting
5930 Unicode string, so we start with size here and then reduce the
5931 length after conversion to the true value.
5932 (but if the error callback returns a long replacement string
5933 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005934 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005935 writer.min_length = size;
5936 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5937 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005938 }
5939
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 end = s + size;
5941 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005942 unsigned char c = (unsigned char) *s++;
5943 Py_UCS4 ch;
5944 int count;
5945 Py_ssize_t startinpos;
5946 Py_ssize_t endinpos;
5947 const char *message;
5948
5949#define WRITE_ASCII_CHAR(ch) \
5950 do { \
5951 assert(ch <= 127); \
5952 assert(writer.pos < writer.size); \
5953 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5954 } while(0)
5955
5956#define WRITE_CHAR(ch) \
5957 do { \
5958 if (ch <= writer.maxchar) { \
5959 assert(writer.pos < writer.size); \
5960 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5961 } \
5962 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5963 goto onError; \
5964 } \
5965 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966
5967 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005968 if (c != '\\') {
5969 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 continue;
5971 }
5972
Victor Stinner62ec3312016-09-06 17:04:34 -07005973 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005975 if (s >= end) {
5976 message = "\\ at end of string";
5977 goto error;
5978 }
5979 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005980
Victor Stinner62ec3312016-09-06 17:04:34 -07005981 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005982 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005985 case '\n': continue;
5986 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5987 case '\'': WRITE_ASCII_CHAR('\''); continue;
5988 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5989 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005990 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005991 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5992 case 't': WRITE_ASCII_CHAR('\t'); continue;
5993 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5994 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005995 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005996 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005997 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005998 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 case '0': case '1': case '2': case '3':
6002 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006003 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006004 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006005 ch = (ch<<3) + *s++ - '0';
6006 if (s < end && '0' <= *s && *s <= '7') {
6007 ch = (ch<<3) + *s++ - '0';
6008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006010 WRITE_CHAR(ch);
6011 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 /* hex escapes */
6014 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006016 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006017 message = "truncated \\xXX escape";
6018 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019
Benjamin Peterson29060642009-01-31 22:14:21 +00006020 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006022 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006023 message = "truncated \\uXXXX escape";
6024 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006027 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006028 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006029 message = "truncated \\UXXXXXXXX escape";
6030 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006031 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006032 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006033 ch <<= 4;
6034 if (c >= '0' && c <= '9') {
6035 ch += c - '0';
6036 }
6037 else if (c >= 'a' && c <= 'f') {
6038 ch += c - ('a' - 10);
6039 }
6040 else if (c >= 'A' && c <= 'F') {
6041 ch += c - ('A' - 10);
6042 }
6043 else {
6044 break;
6045 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006046 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006047 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006048 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006049 }
6050
6051 /* when we get here, ch is a 32-bit unicode character */
6052 if (ch > MAX_UNICODE) {
6053 message = "illegal Unicode character";
6054 goto error;
6055 }
6056
6057 WRITE_CHAR(ch);
6058 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006059
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006061 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006062 if (ucnhash_CAPI == NULL) {
6063 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006064 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6065 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006066 if (ucnhash_CAPI == NULL) {
6067 PyErr_SetString(
6068 PyExc_UnicodeError,
6069 "\\N escapes not supported (can't load unicodedata module)"
6070 );
6071 goto onError;
6072 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006073 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006074
6075 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006076 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006077 const char *start = ++s;
6078 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006079 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006080 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006081 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006082 namelen = s - start;
6083 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006084 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006085 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006086 ch = 0xffffffff; /* in case 'getcode' messes up */
6087 if (namelen <= INT_MAX &&
6088 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6089 &ch, 0)) {
6090 assert(ch <= MAX_UNICODE);
6091 WRITE_CHAR(ch);
6092 continue;
6093 }
6094 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006095 }
6096 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006097 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006098
6099 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006100 if (*first_invalid_escape == NULL) {
6101 *first_invalid_escape = s-1; /* Back up one char, since we've
6102 already incremented s. */
6103 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006104 WRITE_ASCII_CHAR('\\');
6105 WRITE_CHAR(c);
6106 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006108
6109 error:
6110 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006111 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006112 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006113 errors, &errorHandler,
6114 "unicodeescape", message,
6115 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006116 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006117 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006118 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006119 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006120
6121#undef WRITE_ASCII_CHAR
6122#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006124
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006125 Py_XDECREF(errorHandler);
6126 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006127 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006128
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006130 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131 Py_XDECREF(errorHandler);
6132 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 return NULL;
6134}
6135
Eric V. Smith42454af2016-10-31 09:22:08 -04006136PyObject *
6137PyUnicode_DecodeUnicodeEscape(const char *s,
6138 Py_ssize_t size,
6139 const char *errors)
6140{
6141 const char *first_invalid_escape;
6142 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6143 &first_invalid_escape);
6144 if (result == NULL)
6145 return NULL;
6146 if (first_invalid_escape != NULL) {
6147 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6148 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006149 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006150 Py_DECREF(result);
6151 return NULL;
6152 }
6153 }
6154 return result;
6155}
6156
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006157/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158
Alexander Belopolsky40018472011-02-26 01:02:56 +00006159PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006160PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006163 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006165 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006166 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006167 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168
Ezio Melottie7f90372012-10-05 03:33:31 +03006169 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006170 escape.
6171
Ezio Melottie7f90372012-10-05 03:33:31 +03006172 For UCS1 strings it's '\xxx', 4 bytes per source character.
6173 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6174 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006175 */
6176
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177 if (!PyUnicode_Check(unicode)) {
6178 PyErr_BadArgument();
6179 return NULL;
6180 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006181 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006182 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006183 }
Victor Stinner358af132015-10-12 22:36:57 +02006184
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006186 if (len == 0) {
6187 return PyBytes_FromStringAndSize(NULL, 0);
6188 }
6189
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006190 kind = PyUnicode_KIND(unicode);
6191 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006192 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6193 bytes, and 1 byte characters 4. */
6194 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006195 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006196 return PyErr_NoMemory();
6197 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006198 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006199 if (repr == NULL) {
6200 return NULL;
6201 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006202
Victor Stinner62ec3312016-09-06 17:04:34 -07006203 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006204 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006205 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006206
Victor Stinner62ec3312016-09-06 17:04:34 -07006207 /* U+0000-U+00ff range */
6208 if (ch < 0x100) {
6209 if (ch >= ' ' && ch < 127) {
6210 if (ch != '\\') {
6211 /* Copy printable US ASCII as-is */
6212 *p++ = (char) ch;
6213 }
6214 /* Escape backslashes */
6215 else {
6216 *p++ = '\\';
6217 *p++ = '\\';
6218 }
6219 }
Victor Stinner358af132015-10-12 22:36:57 +02006220
Victor Stinner62ec3312016-09-06 17:04:34 -07006221 /* Map special whitespace to '\t', \n', '\r' */
6222 else if (ch == '\t') {
6223 *p++ = '\\';
6224 *p++ = 't';
6225 }
6226 else if (ch == '\n') {
6227 *p++ = '\\';
6228 *p++ = 'n';
6229 }
6230 else if (ch == '\r') {
6231 *p++ = '\\';
6232 *p++ = 'r';
6233 }
6234
6235 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6236 else {
6237 *p++ = '\\';
6238 *p++ = 'x';
6239 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6240 *p++ = Py_hexdigits[ch & 0x000F];
6241 }
Tim Petersced69f82003-09-16 20:30:58 +00006242 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006243 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006244 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245 *p++ = '\\';
6246 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006247 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6248 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6249 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6250 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006252 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6253 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006254
Victor Stinner62ec3312016-09-06 17:04:34 -07006255 /* Make sure that the first two digits are zero */
6256 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006257 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006258 *p++ = 'U';
6259 *p++ = '0';
6260 *p++ = '0';
6261 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6262 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6263 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6264 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6265 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6266 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006267 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269
Victor Stinner62ec3312016-09-06 17:04:34 -07006270 assert(p - PyBytes_AS_STRING(repr) > 0);
6271 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6272 return NULL;
6273 }
6274 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275}
6276
Alexander Belopolsky40018472011-02-26 01:02:56 +00006277PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006278PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6279 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006281 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006282 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006283 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006285 }
6286
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006287 result = PyUnicode_AsUnicodeEscapeString(tmp);
6288 Py_DECREF(tmp);
6289 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290}
6291
6292/* --- Raw Unicode Escape Codec ------------------------------------------- */
6293
Alexander Belopolsky40018472011-02-26 01:02:56 +00006294PyObject *
6295PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006296 Py_ssize_t size,
6297 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006299 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006300 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006302 PyObject *errorHandler = NULL;
6303 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006304
Victor Stinner62ec3312016-09-06 17:04:34 -07006305 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006306 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006307 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006308
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 /* Escaped strings will always be longer than the resulting
6310 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006311 length after conversion to the true value. (But decoding error
6312 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006313 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006314 writer.min_length = size;
6315 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6316 goto onError;
6317 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006318
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 end = s + size;
6320 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006321 unsigned char c = (unsigned char) *s++;
6322 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006323 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006324 Py_ssize_t startinpos;
6325 Py_ssize_t endinpos;
6326 const char *message;
6327
6328#define WRITE_CHAR(ch) \
6329 do { \
6330 if (ch <= writer.maxchar) { \
6331 assert(writer.pos < writer.size); \
6332 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6333 } \
6334 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6335 goto onError; \
6336 } \
6337 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 if (c != '\\' || s >= end) {
6341 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006342 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006343 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006344
Victor Stinner62ec3312016-09-06 17:04:34 -07006345 c = (unsigned char) *s++;
6346 if (c == 'u') {
6347 count = 4;
6348 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006350 else if (c == 'U') {
6351 count = 8;
6352 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006353 }
6354 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006355 assert(writer.pos < writer.size);
6356 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6357 WRITE_CHAR(c);
6358 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006359 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006360 startinpos = s - starts - 2;
6361
6362 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6363 for (ch = 0; count && s < end; ++s, --count) {
6364 c = (unsigned char)*s;
6365 ch <<= 4;
6366 if (c >= '0' && c <= '9') {
6367 ch += c - '0';
6368 }
6369 else if (c >= 'a' && c <= 'f') {
6370 ch += c - ('a' - 10);
6371 }
6372 else if (c >= 'A' && c <= 'F') {
6373 ch += c - ('A' - 10);
6374 }
6375 else {
6376 break;
6377 }
6378 }
6379 if (!count) {
6380 if (ch <= MAX_UNICODE) {
6381 WRITE_CHAR(ch);
6382 continue;
6383 }
6384 message = "\\Uxxxxxxxx out of range";
6385 }
6386
6387 endinpos = s-starts;
6388 writer.min_length = end - s + writer.pos;
6389 if (unicode_decode_call_errorhandler_writer(
6390 errors, &errorHandler,
6391 "rawunicodeescape", message,
6392 &starts, &end, &startinpos, &endinpos, &exc, &s,
6393 &writer)) {
6394 goto onError;
6395 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006396 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006397
6398#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400 Py_XDECREF(errorHandler);
6401 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006402 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006403
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006405 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 Py_XDECREF(errorHandler);
6407 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006409
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410}
6411
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006412
Alexander Belopolsky40018472011-02-26 01:02:56 +00006413PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006414PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415{
Victor Stinner62ec3312016-09-06 17:04:34 -07006416 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006418 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006419 int kind;
6420 void *data;
6421 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006423 if (!PyUnicode_Check(unicode)) {
6424 PyErr_BadArgument();
6425 return NULL;
6426 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006427 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006428 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006429 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006430 kind = PyUnicode_KIND(unicode);
6431 data = PyUnicode_DATA(unicode);
6432 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006433 if (kind == PyUnicode_1BYTE_KIND) {
6434 return PyBytes_FromStringAndSize(data, len);
6435 }
Victor Stinner0e368262011-11-10 20:12:49 +01006436
Victor Stinner62ec3312016-09-06 17:04:34 -07006437 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6438 bytes, and 1 byte characters 4. */
6439 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006440
Victor Stinner62ec3312016-09-06 17:04:34 -07006441 if (len > PY_SSIZE_T_MAX / expandsize) {
6442 return PyErr_NoMemory();
6443 }
6444 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6445 if (repr == NULL) {
6446 return NULL;
6447 }
6448 if (len == 0) {
6449 return repr;
6450 }
6451
6452 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006453 for (pos = 0; pos < len; pos++) {
6454 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006455
Victor Stinner62ec3312016-09-06 17:04:34 -07006456 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6457 if (ch < 0x100) {
6458 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006459 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006460 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006461 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 *p++ = '\\';
6463 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006464 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6465 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6466 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6467 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006469 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6470 else {
6471 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6472 *p++ = '\\';
6473 *p++ = 'U';
6474 *p++ = '0';
6475 *p++ = '0';
6476 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6477 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6478 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6479 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6480 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6481 *p++ = Py_hexdigits[ch & 15];
6482 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006484
Victor Stinner62ec3312016-09-06 17:04:34 -07006485 assert(p > PyBytes_AS_STRING(repr));
6486 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6487 return NULL;
6488 }
6489 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490}
6491
Alexander Belopolsky40018472011-02-26 01:02:56 +00006492PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006493PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6494 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006496 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006497 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006498 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006499 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006500 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6501 Py_DECREF(tmp);
6502 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503}
6504
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006505/* --- Unicode Internal Codec ------------------------------------------- */
6506
Alexander Belopolsky40018472011-02-26 01:02:56 +00006507PyObject *
6508_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006509 Py_ssize_t size,
6510 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006511{
6512 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006513 Py_ssize_t startinpos;
6514 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006515 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006516 const char *end;
6517 const char *reason;
6518 PyObject *errorHandler = NULL;
6519 PyObject *exc = NULL;
6520
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006521 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006522 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006523 1))
6524 return NULL;
6525
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006526 if (size < 0) {
6527 PyErr_BadInternalCall();
6528 return NULL;
6529 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006530 if (size == 0)
6531 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006532
Victor Stinner8f674cc2013-04-17 23:02:17 +02006533 _PyUnicodeWriter_Init(&writer);
6534 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6535 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006537 }
6538 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006539
Victor Stinner8f674cc2013-04-17 23:02:17 +02006540 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006541 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006542 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006543 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006544 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006545 endinpos = end-starts;
6546 reason = "truncated input";
6547 goto error;
6548 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006549 /* We copy the raw representation one byte at a time because the
6550 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006551 ((char *) &uch)[0] = s[0];
6552 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006553#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006554 ((char *) &uch)[2] = s[2];
6555 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006556#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006557 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006558#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006559 /* We have to sanity check the raw data, otherwise doom looms for
6560 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006561 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006562 endinpos = s - starts + Py_UNICODE_SIZE;
6563 reason = "illegal code point (> 0x10FFFF)";
6564 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006565 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006566#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006567 s += Py_UNICODE_SIZE;
6568#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006569 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006570 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006571 Py_UNICODE uch2;
6572 ((char *) &uch2)[0] = s[0];
6573 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006574 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006575 {
Victor Stinner551ac952011-11-29 22:58:13 +01006576 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006577 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006578 }
6579 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006580#endif
6581
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006582 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006583 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006584 continue;
6585
6586 error:
6587 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006588 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006589 errors, &errorHandler,
6590 "unicode_internal", reason,
6591 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006592 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006593 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006594 }
6595
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006596 Py_XDECREF(errorHandler);
6597 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006598 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006599
Benjamin Peterson29060642009-01-31 22:14:21 +00006600 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006601 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006602 Py_XDECREF(errorHandler);
6603 Py_XDECREF(exc);
6604 return NULL;
6605}
6606
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607/* --- Latin-1 Codec ------------------------------------------------------ */
6608
Alexander Belopolsky40018472011-02-26 01:02:56 +00006609PyObject *
6610PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006611 Py_ssize_t size,
6612 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006615 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616}
6617
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006618/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006619static void
6620make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006621 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006622 PyObject *unicode,
6623 Py_ssize_t startpos, Py_ssize_t endpos,
6624 const char *reason)
6625{
6626 if (*exceptionObject == NULL) {
6627 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006628 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006629 encoding, unicode, startpos, endpos, reason);
6630 }
6631 else {
6632 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6633 goto onError;
6634 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6635 goto onError;
6636 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6637 goto onError;
6638 return;
6639 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006640 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006641 }
6642}
6643
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006645static void
6646raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006647 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006648 PyObject *unicode,
6649 Py_ssize_t startpos, Py_ssize_t endpos,
6650 const char *reason)
6651{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006652 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006653 encoding, unicode, startpos, endpos, reason);
6654 if (*exceptionObject != NULL)
6655 PyCodec_StrictErrors(*exceptionObject);
6656}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006657
6658/* error handling callback helper:
6659 build arguments, call the callback and check the arguments,
6660 put the result into newpos and return the replacement string, which
6661 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006662static PyObject *
6663unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006664 PyObject **errorHandler,
6665 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006666 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006667 Py_ssize_t startpos, Py_ssize_t endpos,
6668 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006670 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006671 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006672 PyObject *restuple;
6673 PyObject *resunicode;
6674
6675 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006676 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006677 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006679 }
6680
Benjamin Petersonbac79492012-01-14 13:34:47 -05006681 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006682 return NULL;
6683 len = PyUnicode_GET_LENGTH(unicode);
6684
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006685 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006686 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006688 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006689
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006690 restuple = PyObject_CallFunctionObjArgs(
6691 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006692 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006695 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 Py_DECREF(restuple);
6697 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006698 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006699 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006700 &resunicode, newpos)) {
6701 Py_DECREF(restuple);
6702 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006703 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006704 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6705 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6706 Py_DECREF(restuple);
6707 return NULL;
6708 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006710 *newpos = len + *newpos;
6711 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006712 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 Py_DECREF(restuple);
6714 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006715 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006716 Py_INCREF(resunicode);
6717 Py_DECREF(restuple);
6718 return resunicode;
6719}
6720
Alexander Belopolsky40018472011-02-26 01:02:56 +00006721static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006722unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006723 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006724 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006726 /* input state */
6727 Py_ssize_t pos=0, size;
6728 int kind;
6729 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006730 /* pointer into the output */
6731 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006732 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6733 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006734 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006735 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006736 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006737 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006738 /* output object */
6739 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006740
Benjamin Petersonbac79492012-01-14 13:34:47 -05006741 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006742 return NULL;
6743 size = PyUnicode_GET_LENGTH(unicode);
6744 kind = PyUnicode_KIND(unicode);
6745 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006746 /* allocate enough for a simple encoding without
6747 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006748 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006749 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006750
6751 _PyBytesWriter_Init(&writer);
6752 str = _PyBytesWriter_Alloc(&writer, size);
6753 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006754 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006755
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006756 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006757 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006758
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006760 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006762 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006763 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006764 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006766 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006768 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006769 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006771
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006772 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006774
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006775 /* Only overallocate the buffer if it's not the last write */
6776 writer.overallocate = (collend < size);
6777
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006779 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006780 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006781
6782 switch (error_handler) {
6783 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006784 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006786
6787 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006788 memset(str, '?', collend - collstart);
6789 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006790 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006791 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006792 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006793 break;
Victor Stinner50149202015-09-22 00:26:54 +02006794
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006795 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006796 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006797 writer.min_size -= (collend - collstart);
6798 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006799 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006800 if (str == NULL)
6801 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006802 pos = collend;
6803 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006804
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006805 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006806 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006807 writer.min_size -= (collend - collstart);
6808 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006809 unicode, collstart, collend);
6810 if (str == NULL)
6811 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006812 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006813 break;
Victor Stinner50149202015-09-22 00:26:54 +02006814
Victor Stinnerc3713e92015-09-29 12:32:13 +02006815 case _Py_ERROR_SURROGATEESCAPE:
6816 for (i = collstart; i < collend; ++i) {
6817 ch = PyUnicode_READ(kind, data, i);
6818 if (ch < 0xdc80 || 0xdcff < ch) {
6819 /* Not a UTF-8b surrogate */
6820 break;
6821 }
6822 *str++ = (char)(ch - 0xdc00);
6823 ++pos;
6824 }
6825 if (i >= collend)
6826 break;
6827 collstart = pos;
6828 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006829 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006830
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006832 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6833 encoding, reason, unicode, &exc,
6834 collstart, collend, &newpos);
6835 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006836 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006837
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006838 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006839 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006840
Victor Stinner6bd525b2015-10-09 13:10:05 +02006841 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006842 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006843 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006844 PyBytes_AS_STRING(rep),
6845 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006846 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006847 else {
6848 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006849
Victor Stinner6bd525b2015-10-09 13:10:05 +02006850 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006852
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006853 if (limit == 256 ?
6854 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6855 !PyUnicode_IS_ASCII(rep))
6856 {
6857 /* Not all characters are smaller than limit */
6858 raise_encode_exception(&exc, encoding, unicode,
6859 collstart, collend, reason);
6860 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006862 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6863 str = _PyBytesWriter_WriteBytes(&writer, str,
6864 PyUnicode_DATA(rep),
6865 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006867 if (str == NULL)
6868 goto onError;
6869
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006870 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006871 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006872 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006873
6874 /* If overallocation was disabled, ensure that it was the last
6875 write. Otherwise, we missed an optimization */
6876 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006877 }
6878 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006879
Victor Stinner50149202015-09-22 00:26:54 +02006880 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006881 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006882 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006883
6884 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006885 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006886 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006887 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006888 Py_XDECREF(exc);
6889 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006890}
6891
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006892/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006893PyObject *
6894PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006895 Py_ssize_t size,
6896 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006898 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006899 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006900 if (unicode == NULL)
6901 return NULL;
6902 result = unicode_encode_ucs1(unicode, errors, 256);
6903 Py_DECREF(unicode);
6904 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905}
6906
Alexander Belopolsky40018472011-02-26 01:02:56 +00006907PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006908_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909{
6910 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006911 PyErr_BadArgument();
6912 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006914 if (PyUnicode_READY(unicode) == -1)
6915 return NULL;
6916 /* Fast path: if it is a one-byte string, construct
6917 bytes object directly. */
6918 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6919 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6920 PyUnicode_GET_LENGTH(unicode));
6921 /* Non-Latin-1 characters present. Defer to above function to
6922 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006923 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006924}
6925
6926PyObject*
6927PyUnicode_AsLatin1String(PyObject *unicode)
6928{
6929 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930}
6931
6932/* --- 7-bit ASCII Codec -------------------------------------------------- */
6933
Alexander Belopolsky40018472011-02-26 01:02:56 +00006934PyObject *
6935PyUnicode_DecodeASCII(const char *s,
6936 Py_ssize_t size,
6937 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006939 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006940 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006941 int kind;
6942 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006943 Py_ssize_t startinpos;
6944 Py_ssize_t endinpos;
6945 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006946 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006947 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006948 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006949 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006950
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006952 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006953
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006955 if (size == 1 && (unsigned char)s[0] < 128)
6956 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006957
Victor Stinner8f674cc2013-04-17 23:02:17 +02006958 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006959 writer.min_length = size;
6960 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006961 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006962
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006963 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006964 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006965 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006966 writer.pos = outpos;
6967 if (writer.pos == size)
6968 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006969
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006970 s += writer.pos;
6971 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006972 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006973 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006975 PyUnicode_WRITE(kind, data, writer.pos, c);
6976 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006978 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006979 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006980
6981 /* byte outsize range 0x00..0x7f: call the error handler */
6982
6983 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006984 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006985
6986 switch (error_handler)
6987 {
6988 case _Py_ERROR_REPLACE:
6989 case _Py_ERROR_SURROGATEESCAPE:
6990 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006991 but we may switch to UCS2 at the first write */
6992 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6993 goto onError;
6994 kind = writer.kind;
6995 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006996
6997 if (error_handler == _Py_ERROR_REPLACE)
6998 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6999 else
7000 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7001 writer.pos++;
7002 ++s;
7003 break;
7004
7005 case _Py_ERROR_IGNORE:
7006 ++s;
7007 break;
7008
7009 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 startinpos = s-starts;
7011 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007012 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007013 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 "ascii", "ordinal not in range(128)",
7015 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007016 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007018 kind = writer.kind;
7019 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007022 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007023 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007024 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007025
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007027 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007028 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007029 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 return NULL;
7031}
7032
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007033/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007034PyObject *
7035PyUnicode_EncodeASCII(const Py_UNICODE *p,
7036 Py_ssize_t size,
7037 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007039 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007040 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007041 if (unicode == NULL)
7042 return NULL;
7043 result = unicode_encode_ucs1(unicode, errors, 128);
7044 Py_DECREF(unicode);
7045 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046}
7047
Alexander Belopolsky40018472011-02-26 01:02:56 +00007048PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007049_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050{
7051 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 PyErr_BadArgument();
7053 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007055 if (PyUnicode_READY(unicode) == -1)
7056 return NULL;
7057 /* Fast path: if it is an ASCII-only string, construct bytes object
7058 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007059 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007060 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7061 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007062 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007063}
7064
7065PyObject *
7066PyUnicode_AsASCIIString(PyObject *unicode)
7067{
7068 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069}
7070
Steve Dowercc16be82016-09-08 10:35:16 -07007071#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007072
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007073/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007074
/* NEED_RETRY is defined when `int` is narrower than `size_t`.
   decode_code_page_stateful() tests it to decide whether an input longer
   than INT_MAX has to be processed in several chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Fallback definition for headers that do not provide the flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7082
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007083static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007084code_page_name(UINT code_page, PyObject **obj)
7085{
7086 *obj = NULL;
7087 if (code_page == CP_ACP)
7088 return "mbcs";
7089 if (code_page == CP_UTF7)
7090 return "CP_UTF7";
7091 if (code_page == CP_UTF8)
7092 return "CP_UTF8";
7093
7094 *obj = PyBytes_FromFormat("cp%u", code_page);
7095 if (*obj == NULL)
7096 return NULL;
7097 return PyBytes_AS_STRING(*obj);
7098}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007099
Victor Stinner3a50e702011-10-18 21:21:00 +02007100static DWORD
7101decode_code_page_flags(UINT code_page)
7102{
7103 if (code_page == CP_UTF7) {
7104 /* The CP_UTF7 decoder only supports flags=0 */
7105 return 0;
7106 }
7107 else
7108 return MB_ERR_INVALID_CHARS;
7109}
7110
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007111/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007112 * Decode a byte string from a Windows code page into unicode object in strict
7113 * mode.
7114 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007115 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7116 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007118static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007119decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007120 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 const char *in,
7122 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007123{
Victor Stinner3a50e702011-10-18 21:21:00 +02007124 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007125 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007126 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007127
7128 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007129 assert(insize > 0);
7130 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7131 if (outsize <= 0)
7132 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007133
7134 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007136 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007137 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 if (*v == NULL)
7139 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007141 }
7142 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007143 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007144 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007145 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007146 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007148 }
7149
7150 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007151 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7152 if (outsize <= 0)
7153 goto error;
7154 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007155
Victor Stinner3a50e702011-10-18 21:21:00 +02007156error:
7157 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7158 return -2;
7159 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007160 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007161}
7162
Victor Stinner3a50e702011-10-18 21:21:00 +02007163/*
7164 * Decode a byte string from a code page into unicode object with an error
7165 * handler.
7166 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007167 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 * UnicodeDecodeError exception and returns -1 on error.
7169 */
7170static int
7171decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007172 PyObject **v,
7173 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007174 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007175{
7176 const char *startin = in;
7177 const char *endin = in + size;
7178 const DWORD flags = decode_code_page_flags(code_page);
7179 /* Ideally, we should get reason from FormatMessage. This is the Windows
7180 2000 English version of the message. */
7181 const char *reason = "No mapping for the Unicode character exists "
7182 "in the target code page.";
7183 /* each step cannot decode more than 1 character, but a character can be
7184 represented as a surrogate pair */
7185 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007186 int insize;
7187 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 PyObject *errorHandler = NULL;
7189 PyObject *exc = NULL;
7190 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007191 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 DWORD err;
7193 int ret = -1;
7194
7195 assert(size > 0);
7196
7197 encoding = code_page_name(code_page, &encoding_obj);
7198 if (encoding == NULL)
7199 return -1;
7200
Victor Stinner7d00cc12014-03-17 23:08:06 +01007201 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7203 UnicodeDecodeError. */
7204 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7205 if (exc != NULL) {
7206 PyCodec_StrictErrors(exc);
7207 Py_CLEAR(exc);
7208 }
7209 goto error;
7210 }
7211
7212 if (*v == NULL) {
7213 /* Create unicode object */
7214 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7215 PyErr_NoMemory();
7216 goto error;
7217 }
Victor Stinnerab595942011-12-17 04:59:06 +01007218 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007219 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 if (*v == NULL)
7221 goto error;
7222 startout = PyUnicode_AS_UNICODE(*v);
7223 }
7224 else {
7225 /* Extend unicode object */
7226 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7227 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7228 PyErr_NoMemory();
7229 goto error;
7230 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007231 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 goto error;
7233 startout = PyUnicode_AS_UNICODE(*v) + n;
7234 }
7235
7236 /* Decode the byte string character per character */
7237 out = startout;
7238 while (in < endin)
7239 {
7240 /* Decode a character */
7241 insize = 1;
7242 do
7243 {
7244 outsize = MultiByteToWideChar(code_page, flags,
7245 in, insize,
7246 buffer, Py_ARRAY_LENGTH(buffer));
7247 if (outsize > 0)
7248 break;
7249 err = GetLastError();
7250 if (err != ERROR_NO_UNICODE_TRANSLATION
7251 && err != ERROR_INSUFFICIENT_BUFFER)
7252 {
7253 PyErr_SetFromWindowsErr(0);
7254 goto error;
7255 }
7256 insize++;
7257 }
7258 /* 4=maximum length of a UTF-8 sequence */
7259 while (insize <= 4 && (in + insize) <= endin);
7260
7261 if (outsize <= 0) {
7262 Py_ssize_t startinpos, endinpos, outpos;
7263
Victor Stinner7d00cc12014-03-17 23:08:06 +01007264 /* last character in partial decode? */
7265 if (in + insize >= endin && !final)
7266 break;
7267
Victor Stinner3a50e702011-10-18 21:21:00 +02007268 startinpos = in - startin;
7269 endinpos = startinpos + 1;
7270 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007271 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 errors, &errorHandler,
7273 encoding, reason,
7274 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007275 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 {
7277 goto error;
7278 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007279 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007280 }
7281 else {
7282 in += insize;
7283 memcpy(out, buffer, outsize * sizeof(wchar_t));
7284 out += outsize;
7285 }
7286 }
7287
7288 /* write a NUL character at the end */
7289 *out = 0;
7290
7291 /* Extend unicode object */
7292 outsize = out - startout;
7293 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007294 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007295 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007296 /* (in - startin) <= size and size is an int */
7297 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007298
7299error:
7300 Py_XDECREF(encoding_obj);
7301 Py_XDECREF(errorHandler);
7302 Py_XDECREF(exc);
7303 return ret;
7304}
7305
Victor Stinner3a50e702011-10-18 21:21:00 +02007306static PyObject *
7307decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007308 const char *s, Py_ssize_t size,
7309 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007310{
Victor Stinner76a31a62011-11-04 00:05:13 +01007311 PyObject *v = NULL;
7312 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007313
Victor Stinner3a50e702011-10-18 21:21:00 +02007314 if (code_page < 0) {
7315 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7316 return NULL;
7317 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007318 if (size < 0) {
7319 PyErr_BadInternalCall();
7320 return NULL;
7321 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007322
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007323 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007324 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007325
Victor Stinner76a31a62011-11-04 00:05:13 +01007326 do
7327 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007328#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007329 if (size > INT_MAX) {
7330 chunk_size = INT_MAX;
7331 final = 0;
7332 done = 0;
7333 }
7334 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007335#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007336 {
7337 chunk_size = (int)size;
7338 final = (consumed == NULL);
7339 done = 1;
7340 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007341
Victor Stinner76a31a62011-11-04 00:05:13 +01007342 if (chunk_size == 0 && done) {
7343 if (v != NULL)
7344 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007345 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007346 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007347
Victor Stinner76a31a62011-11-04 00:05:13 +01007348 converted = decode_code_page_strict(code_page, &v,
7349 s, chunk_size);
7350 if (converted == -2)
7351 converted = decode_code_page_errors(code_page, &v,
7352 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007353 errors, final);
7354 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007355
7356 if (converted < 0) {
7357 Py_XDECREF(v);
7358 return NULL;
7359 }
7360
7361 if (consumed)
7362 *consumed += converted;
7363
7364 s += converted;
7365 size -= converted;
7366 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007367
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007368 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007369}
7370
Alexander Belopolsky40018472011-02-26 01:02:56 +00007371PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007372PyUnicode_DecodeCodePageStateful(int code_page,
7373 const char *s,
7374 Py_ssize_t size,
7375 const char *errors,
7376 Py_ssize_t *consumed)
7377{
7378 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7379}
7380
7381PyObject *
7382PyUnicode_DecodeMBCSStateful(const char *s,
7383 Py_ssize_t size,
7384 const char *errors,
7385 Py_ssize_t *consumed)
7386{
7387 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7388}
7389
7390PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007391PyUnicode_DecodeMBCS(const char *s,
7392 Py_ssize_t size,
7393 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007394{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007395 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7396}
7397
Victor Stinner3a50e702011-10-18 21:21:00 +02007398static DWORD
7399encode_code_page_flags(UINT code_page, const char *errors)
7400{
7401 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007402 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007403 }
7404 else if (code_page == CP_UTF7) {
7405 /* CP_UTF7 only supports flags=0 */
7406 return 0;
7407 }
7408 else {
7409 if (errors != NULL && strcmp(errors, "replace") == 0)
7410 return 0;
7411 else
7412 return WC_NO_BEST_FIT_CHARS;
7413 }
7414}
7415
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007416/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 * Encode a Unicode string to a Windows code page into a byte string in strict
7418 * mode.
7419 *
7420 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007421 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007422 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007423static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007424encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007425 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007427{
Victor Stinner554f3f02010-06-16 23:33:54 +00007428 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 BOOL *pusedDefaultChar = &usedDefaultChar;
7430 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007431 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007432 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 const DWORD flags = encode_code_page_flags(code_page, NULL);
7434 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007435 /* Create a substring so that we can get the UTF-16 representation
7436 of just the slice under consideration. */
7437 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007438
Martin v. Löwis3d325192011-11-04 18:23:06 +01007439 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007440
Victor Stinner3a50e702011-10-18 21:21:00 +02007441 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007442 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007444 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007445
Victor Stinner2fc507f2011-11-04 20:06:39 +01007446 substring = PyUnicode_Substring(unicode, offset, offset+len);
7447 if (substring == NULL)
7448 return -1;
7449 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7450 if (p == NULL) {
7451 Py_DECREF(substring);
7452 return -1;
7453 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007454 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007455
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007456 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007458 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 NULL, 0,
7460 NULL, pusedDefaultChar);
7461 if (outsize <= 0)
7462 goto error;
7463 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007464 if (pusedDefaultChar && *pusedDefaultChar) {
7465 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007467 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007468
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007472 if (*outbytes == NULL) {
7473 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007475 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007477 }
7478 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 const Py_ssize_t n = PyBytes_Size(*outbytes);
7481 if (outsize > PY_SSIZE_T_MAX - n) {
7482 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007483 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007486 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7487 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007489 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007491 }
7492
7493 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007494 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007495 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007496 out, outsize,
7497 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007498 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 if (outsize <= 0)
7500 goto error;
7501 if (pusedDefaultChar && *pusedDefaultChar)
7502 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007503 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007504
Victor Stinner3a50e702011-10-18 21:21:00 +02007505error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007506 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007507 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7508 return -2;
7509 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007510 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007511}
7512
Victor Stinner3a50e702011-10-18 21:21:00 +02007513/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007514 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007515 * error handler.
7516 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007517 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007518 * -1 on other error.
7519 */
7520static int
7521encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007522 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007523 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007524{
Victor Stinner3a50e702011-10-18 21:21:00 +02007525 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007526 Py_ssize_t pos = unicode_offset;
7527 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007528 /* Ideally, we should get reason from FormatMessage. This is the Windows
7529 2000 English version of the message. */
7530 const char *reason = "invalid character";
7531 /* 4=maximum length of a UTF-8 sequence */
7532 char buffer[4];
7533 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7534 Py_ssize_t outsize;
7535 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007536 PyObject *errorHandler = NULL;
7537 PyObject *exc = NULL;
7538 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007539 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007540 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007541 PyObject *rep;
7542 int ret = -1;
7543
7544 assert(insize > 0);
7545
7546 encoding = code_page_name(code_page, &encoding_obj);
7547 if (encoding == NULL)
7548 return -1;
7549
7550 if (errors == NULL || strcmp(errors, "strict") == 0) {
7551 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7552 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007553 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007554 if (exc != NULL) {
7555 PyCodec_StrictErrors(exc);
7556 Py_DECREF(exc);
7557 }
7558 Py_XDECREF(encoding_obj);
7559 return -1;
7560 }
7561
7562 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7563 pusedDefaultChar = &usedDefaultChar;
7564 else
7565 pusedDefaultChar = NULL;
7566
7567 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7568 PyErr_NoMemory();
7569 goto error;
7570 }
7571 outsize = insize * Py_ARRAY_LENGTH(buffer);
7572
7573 if (*outbytes == NULL) {
7574 /* Create string object */
7575 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7576 if (*outbytes == NULL)
7577 goto error;
7578 out = PyBytes_AS_STRING(*outbytes);
7579 }
7580 else {
7581 /* Extend string object */
7582 Py_ssize_t n = PyBytes_Size(*outbytes);
7583 if (n > PY_SSIZE_T_MAX - outsize) {
7584 PyErr_NoMemory();
7585 goto error;
7586 }
7587 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7588 goto error;
7589 out = PyBytes_AS_STRING(*outbytes) + n;
7590 }
7591
7592 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007593 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007595 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7596 wchar_t chars[2];
7597 int charsize;
7598 if (ch < 0x10000) {
7599 chars[0] = (wchar_t)ch;
7600 charsize = 1;
7601 }
7602 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007603 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7604 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007605 charsize = 2;
7606 }
7607
Victor Stinner3a50e702011-10-18 21:21:00 +02007608 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007609 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007610 buffer, Py_ARRAY_LENGTH(buffer),
7611 NULL, pusedDefaultChar);
7612 if (outsize > 0) {
7613 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7614 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007615 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007616 memcpy(out, buffer, outsize);
7617 out += outsize;
7618 continue;
7619 }
7620 }
7621 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7622 PyErr_SetFromWindowsErr(0);
7623 goto error;
7624 }
7625
Victor Stinner3a50e702011-10-18 21:21:00 +02007626 rep = unicode_encode_call_errorhandler(
7627 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007628 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007629 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007630 if (rep == NULL)
7631 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007632 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007633
7634 if (PyBytes_Check(rep)) {
7635 outsize = PyBytes_GET_SIZE(rep);
7636 if (outsize != 1) {
7637 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7638 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7639 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7640 Py_DECREF(rep);
7641 goto error;
7642 }
7643 out = PyBytes_AS_STRING(*outbytes) + offset;
7644 }
7645 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7646 out += outsize;
7647 }
7648 else {
7649 Py_ssize_t i;
7650 enum PyUnicode_Kind kind;
7651 void *data;
7652
Benjamin Petersonbac79492012-01-14 13:34:47 -05007653 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007654 Py_DECREF(rep);
7655 goto error;
7656 }
7657
7658 outsize = PyUnicode_GET_LENGTH(rep);
7659 if (outsize != 1) {
7660 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7661 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7662 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7663 Py_DECREF(rep);
7664 goto error;
7665 }
7666 out = PyBytes_AS_STRING(*outbytes) + offset;
7667 }
7668 kind = PyUnicode_KIND(rep);
7669 data = PyUnicode_DATA(rep);
7670 for (i=0; i < outsize; i++) {
7671 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7672 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007673 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007674 encoding, unicode,
7675 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007676 "unable to encode error handler result to ASCII");
7677 Py_DECREF(rep);
7678 goto error;
7679 }
7680 *out = (unsigned char)ch;
7681 out++;
7682 }
7683 }
7684 Py_DECREF(rep);
7685 }
7686 /* write a NUL byte */
7687 *out = 0;
7688 outsize = out - PyBytes_AS_STRING(*outbytes);
7689 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7690 if (_PyBytes_Resize(outbytes, outsize) < 0)
7691 goto error;
7692 ret = 0;
7693
7694error:
7695 Py_XDECREF(encoding_obj);
7696 Py_XDECREF(errorHandler);
7697 Py_XDECREF(exc);
7698 return ret;
7699}
7700
Victor Stinner3a50e702011-10-18 21:21:00 +02007701static PyObject *
7702encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007703 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007704 const char *errors)
7705{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007706 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007707 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007708 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007709 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007710
Victor Stinner29dacf22015-01-26 16:41:32 +01007711 if (!PyUnicode_Check(unicode)) {
7712 PyErr_BadArgument();
7713 return NULL;
7714 }
7715
Benjamin Petersonbac79492012-01-14 13:34:47 -05007716 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007717 return NULL;
7718 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007719
Victor Stinner3a50e702011-10-18 21:21:00 +02007720 if (code_page < 0) {
7721 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7722 return NULL;
7723 }
7724
Martin v. Löwis3d325192011-11-04 18:23:06 +01007725 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007726 return PyBytes_FromStringAndSize(NULL, 0);
7727
Victor Stinner7581cef2011-11-03 22:32:33 +01007728 offset = 0;
7729 do
7730 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007731#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007732 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007733 chunks. */
7734 if (len > INT_MAX/2) {
7735 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007736 done = 0;
7737 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007738 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007739#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007740 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007741 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007742 done = 1;
7743 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007744
Victor Stinner76a31a62011-11-04 00:05:13 +01007745 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007746 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007747 errors);
7748 if (ret == -2)
7749 ret = encode_code_page_errors(code_page, &outbytes,
7750 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007751 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007752 if (ret < 0) {
7753 Py_XDECREF(outbytes);
7754 return NULL;
7755 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007756
Victor Stinner7581cef2011-11-03 22:32:33 +01007757 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007758 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007759 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007760
Victor Stinner3a50e702011-10-18 21:21:00 +02007761 return outbytes;
7762}
7763
7764PyObject *
7765PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7766 Py_ssize_t size,
7767 const char *errors)
7768{
Victor Stinner7581cef2011-11-03 22:32:33 +01007769 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007770 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007771 if (unicode == NULL)
7772 return NULL;
7773 res = encode_code_page(CP_ACP, unicode, errors);
7774 Py_DECREF(unicode);
7775 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007776}
7777
7778PyObject *
7779PyUnicode_EncodeCodePage(int code_page,
7780 PyObject *unicode,
7781 const char *errors)
7782{
Victor Stinner7581cef2011-11-03 22:32:33 +01007783 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007784}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007785
Alexander Belopolsky40018472011-02-26 01:02:56 +00007786PyObject *
7787PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007788{
Victor Stinner7581cef2011-11-03 22:32:33 +01007789 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007790}
7791
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007792#undef NEED_RETRY
7793
Steve Dowercc16be82016-09-08 10:35:16 -07007794#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007795
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796/* --- Character Mapping Codec -------------------------------------------- */
7797
Victor Stinnerfb161b12013-04-18 01:44:27 +02007798static int
7799charmap_decode_string(const char *s,
7800 Py_ssize_t size,
7801 PyObject *mapping,
7802 const char *errors,
7803 _PyUnicodeWriter *writer)
7804{
7805 const char *starts = s;
7806 const char *e;
7807 Py_ssize_t startinpos, endinpos;
7808 PyObject *errorHandler = NULL, *exc = NULL;
7809 Py_ssize_t maplen;
7810 enum PyUnicode_Kind mapkind;
7811 void *mapdata;
7812 Py_UCS4 x;
7813 unsigned char ch;
7814
7815 if (PyUnicode_READY(mapping) == -1)
7816 return -1;
7817
7818 maplen = PyUnicode_GET_LENGTH(mapping);
7819 mapdata = PyUnicode_DATA(mapping);
7820 mapkind = PyUnicode_KIND(mapping);
7821
7822 e = s + size;
7823
7824 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7825 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7826 * is disabled in encoding aliases, latin1 is preferred because
7827 * its implementation is faster. */
7828 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7829 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7830 Py_UCS4 maxchar = writer->maxchar;
7831
7832 assert (writer->kind == PyUnicode_1BYTE_KIND);
7833 while (s < e) {
7834 ch = *s;
7835 x = mapdata_ucs1[ch];
7836 if (x > maxchar) {
7837 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7838 goto onError;
7839 maxchar = writer->maxchar;
7840 outdata = (Py_UCS1 *)writer->data;
7841 }
7842 outdata[writer->pos] = x;
7843 writer->pos++;
7844 ++s;
7845 }
7846 return 0;
7847 }
7848
7849 while (s < e) {
7850 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7851 enum PyUnicode_Kind outkind = writer->kind;
7852 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7853 if (outkind == PyUnicode_1BYTE_KIND) {
7854 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7855 Py_UCS4 maxchar = writer->maxchar;
7856 while (s < e) {
7857 ch = *s;
7858 x = mapdata_ucs2[ch];
7859 if (x > maxchar)
7860 goto Error;
7861 outdata[writer->pos] = x;
7862 writer->pos++;
7863 ++s;
7864 }
7865 break;
7866 }
7867 else if (outkind == PyUnicode_2BYTE_KIND) {
7868 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7869 while (s < e) {
7870 ch = *s;
7871 x = mapdata_ucs2[ch];
7872 if (x == 0xFFFE)
7873 goto Error;
7874 outdata[writer->pos] = x;
7875 writer->pos++;
7876 ++s;
7877 }
7878 break;
7879 }
7880 }
7881 ch = *s;
7882
7883 if (ch < maplen)
7884 x = PyUnicode_READ(mapkind, mapdata, ch);
7885 else
7886 x = 0xfffe; /* invalid value */
7887Error:
7888 if (x == 0xfffe)
7889 {
7890 /* undefined mapping */
7891 startinpos = s-starts;
7892 endinpos = startinpos+1;
7893 if (unicode_decode_call_errorhandler_writer(
7894 errors, &errorHandler,
7895 "charmap", "character maps to <undefined>",
7896 &starts, &e, &startinpos, &endinpos, &exc, &s,
7897 writer)) {
7898 goto onError;
7899 }
7900 continue;
7901 }
7902
7903 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7904 goto onError;
7905 ++s;
7906 }
7907 Py_XDECREF(errorHandler);
7908 Py_XDECREF(exc);
7909 return 0;
7910
7911onError:
7912 Py_XDECREF(errorHandler);
7913 Py_XDECREF(exc);
7914 return -1;
7915}
7916
7917static int
7918charmap_decode_mapping(const char *s,
7919 Py_ssize_t size,
7920 PyObject *mapping,
7921 const char *errors,
7922 _PyUnicodeWriter *writer)
7923{
7924 const char *starts = s;
7925 const char *e;
7926 Py_ssize_t startinpos, endinpos;
7927 PyObject *errorHandler = NULL, *exc = NULL;
7928 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007929 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007930
7931 e = s + size;
7932
7933 while (s < e) {
7934 ch = *s;
7935
7936 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7937 key = PyLong_FromLong((long)ch);
7938 if (key == NULL)
7939 goto onError;
7940
7941 item = PyObject_GetItem(mapping, key);
7942 Py_DECREF(key);
7943 if (item == NULL) {
7944 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7945 /* No mapping found means: mapping is undefined. */
7946 PyErr_Clear();
7947 goto Undefined;
7948 } else
7949 goto onError;
7950 }
7951
7952 /* Apply mapping */
7953 if (item == Py_None)
7954 goto Undefined;
7955 if (PyLong_Check(item)) {
7956 long value = PyLong_AS_LONG(item);
7957 if (value == 0xFFFE)
7958 goto Undefined;
7959 if (value < 0 || value > MAX_UNICODE) {
7960 PyErr_Format(PyExc_TypeError,
7961 "character mapping must be in range(0x%lx)",
7962 (unsigned long)MAX_UNICODE + 1);
7963 goto onError;
7964 }
7965
7966 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7967 goto onError;
7968 }
7969 else if (PyUnicode_Check(item)) {
7970 if (PyUnicode_READY(item) == -1)
7971 goto onError;
7972 if (PyUnicode_GET_LENGTH(item) == 1) {
7973 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7974 if (value == 0xFFFE)
7975 goto Undefined;
7976 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7977 goto onError;
7978 }
7979 else {
7980 writer->overallocate = 1;
7981 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7982 goto onError;
7983 }
7984 }
7985 else {
7986 /* wrong return value */
7987 PyErr_SetString(PyExc_TypeError,
7988 "character mapping must return integer, None or str");
7989 goto onError;
7990 }
7991 Py_CLEAR(item);
7992 ++s;
7993 continue;
7994
7995Undefined:
7996 /* undefined mapping */
7997 Py_CLEAR(item);
7998 startinpos = s-starts;
7999 endinpos = startinpos+1;
8000 if (unicode_decode_call_errorhandler_writer(
8001 errors, &errorHandler,
8002 "charmap", "character maps to <undefined>",
8003 &starts, &e, &startinpos, &endinpos, &exc, &s,
8004 writer)) {
8005 goto onError;
8006 }
8007 }
8008 Py_XDECREF(errorHandler);
8009 Py_XDECREF(exc);
8010 return 0;
8011
8012onError:
8013 Py_XDECREF(item);
8014 Py_XDECREF(errorHandler);
8015 Py_XDECREF(exc);
8016 return -1;
8017}
8018
Alexander Belopolsky40018472011-02-26 01:02:56 +00008019PyObject *
8020PyUnicode_DecodeCharmap(const char *s,
8021 Py_ssize_t size,
8022 PyObject *mapping,
8023 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008025 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008026
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 /* Default to Latin-1 */
8028 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008032 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008033 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008034 writer.min_length = size;
8035 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008037
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008038 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008039 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8040 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008041 }
8042 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008043 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8044 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008046 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008047
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008049 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050 return NULL;
8051}
8052
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053/* Charmap encoding: the lookup table */
8054
Alexander Belopolsky40018472011-02-26 01:02:56 +00008055struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 PyObject_HEAD
8057 unsigned char level1[32];
8058 int count2, count3;
8059 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008060};
8061
8062static PyObject*
8063encoding_map_size(PyObject *obj, PyObject* args)
8064{
8065 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008066 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008068}
8069
8070static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008071 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 PyDoc_STR("Return the size (in bytes) of this object") },
8073 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008074};
8075
8076static void
8077encoding_map_dealloc(PyObject* o)
8078{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008079 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008080}
8081
8082static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008083 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 "EncodingMap", /*tp_name*/
8085 sizeof(struct encoding_map), /*tp_basicsize*/
8086 0, /*tp_itemsize*/
8087 /* methods */
8088 encoding_map_dealloc, /*tp_dealloc*/
8089 0, /*tp_print*/
8090 0, /*tp_getattr*/
8091 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008092 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 0, /*tp_repr*/
8094 0, /*tp_as_number*/
8095 0, /*tp_as_sequence*/
8096 0, /*tp_as_mapping*/
8097 0, /*tp_hash*/
8098 0, /*tp_call*/
8099 0, /*tp_str*/
8100 0, /*tp_getattro*/
8101 0, /*tp_setattro*/
8102 0, /*tp_as_buffer*/
8103 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8104 0, /*tp_doc*/
8105 0, /*tp_traverse*/
8106 0, /*tp_clear*/
8107 0, /*tp_richcompare*/
8108 0, /*tp_weaklistoffset*/
8109 0, /*tp_iter*/
8110 0, /*tp_iternext*/
8111 encoding_map_methods, /*tp_methods*/
8112 0, /*tp_members*/
8113 0, /*tp_getset*/
8114 0, /*tp_base*/
8115 0, /*tp_dict*/
8116 0, /*tp_descr_get*/
8117 0, /*tp_descr_set*/
8118 0, /*tp_dictoffset*/
8119 0, /*tp_init*/
8120 0, /*tp_alloc*/
8121 0, /*tp_new*/
8122 0, /*tp_free*/
8123 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008124};
8125
8126PyObject*
8127PyUnicode_BuildEncodingMap(PyObject* string)
8128{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008129 PyObject *result;
8130 struct encoding_map *mresult;
8131 int i;
8132 int need_dict = 0;
8133 unsigned char level1[32];
8134 unsigned char level2[512];
8135 unsigned char *mlevel1, *mlevel2, *mlevel3;
8136 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008137 int kind;
8138 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008139 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008140 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008142 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008143 PyErr_BadArgument();
8144 return NULL;
8145 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008146 kind = PyUnicode_KIND(string);
8147 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008148 length = PyUnicode_GET_LENGTH(string);
8149 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008150 memset(level1, 0xFF, sizeof level1);
8151 memset(level2, 0xFF, sizeof level2);
8152
8153 /* If there isn't a one-to-one mapping of NULL to \0,
8154 or if there are non-BMP characters, we need to use
8155 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008157 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008158 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008159 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008160 ch = PyUnicode_READ(kind, data, i);
8161 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008162 need_dict = 1;
8163 break;
8164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008165 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008166 /* unmapped character */
8167 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008168 l1 = ch >> 11;
8169 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008170 if (level1[l1] == 0xFF)
8171 level1[l1] = count2++;
8172 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008173 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008174 }
8175
8176 if (count2 >= 0xFF || count3 >= 0xFF)
8177 need_dict = 1;
8178
8179 if (need_dict) {
8180 PyObject *result = PyDict_New();
8181 PyObject *key, *value;
8182 if (!result)
8183 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008184 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008185 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008186 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008187 if (!key || !value)
8188 goto failed1;
8189 if (PyDict_SetItem(result, key, value) == -1)
8190 goto failed1;
8191 Py_DECREF(key);
8192 Py_DECREF(value);
8193 }
8194 return result;
8195 failed1:
8196 Py_XDECREF(key);
8197 Py_XDECREF(value);
8198 Py_DECREF(result);
8199 return NULL;
8200 }
8201
8202 /* Create a three-level trie */
8203 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8204 16*count2 + 128*count3 - 1);
8205 if (!result)
8206 return PyErr_NoMemory();
8207 PyObject_Init(result, &EncodingMapType);
8208 mresult = (struct encoding_map*)result;
8209 mresult->count2 = count2;
8210 mresult->count3 = count3;
8211 mlevel1 = mresult->level1;
8212 mlevel2 = mresult->level23;
8213 mlevel3 = mresult->level23 + 16*count2;
8214 memcpy(mlevel1, level1, 32);
8215 memset(mlevel2, 0xFF, 16*count2);
8216 memset(mlevel3, 0, 128*count3);
8217 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008218 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008219 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008220 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8221 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008222 /* unmapped character */
8223 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008224 o1 = ch>>11;
8225 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008226 i2 = 16*mlevel1[o1] + o2;
8227 if (mlevel2[i2] == 0xFF)
8228 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008229 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008230 i3 = 128*mlevel2[i2] + o3;
8231 mlevel3[i3] = i;
8232 }
8233 return result;
8234}
8235
8236static int
Victor Stinner22168992011-11-20 17:09:18 +01008237encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008238{
8239 struct encoding_map *map = (struct encoding_map*)mapping;
8240 int l1 = c>>11;
8241 int l2 = (c>>7) & 0xF;
8242 int l3 = c & 0x7F;
8243 int i;
8244
Victor Stinner22168992011-11-20 17:09:18 +01008245 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008247 if (c == 0)
8248 return 0;
8249 /* level 1*/
8250 i = map->level1[l1];
8251 if (i == 0xFF) {
8252 return -1;
8253 }
8254 /* level 2*/
8255 i = map->level23[16*i+l2];
8256 if (i == 0xFF) {
8257 return -1;
8258 }
8259 /* level 3 */
8260 i = map->level23[16*map->count2 + 128*i + l3];
8261 if (i == 0) {
8262 return -1;
8263 }
8264 return i;
8265}
8266
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008267/* Lookup the character ch in the mapping. If the character
8268 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008269 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008270static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008271charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272{
Christian Heimes217cfd12007-12-02 14:31:20 +00008273 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008274 PyObject *x;
8275
8276 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 x = PyObject_GetItem(mapping, w);
8279 Py_DECREF(w);
8280 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8282 /* No mapping found means: mapping is undefined. */
8283 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008284 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 } else
8286 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008288 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008290 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 long value = PyLong_AS_LONG(x);
8292 if (value < 0 || value > 255) {
8293 PyErr_SetString(PyExc_TypeError,
8294 "character mapping must be in range(256)");
8295 Py_DECREF(x);
8296 return NULL;
8297 }
8298 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008300 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 /* wrong return value */
8304 PyErr_Format(PyExc_TypeError,
8305 "character mapping must return integer, bytes or None, not %.400s",
8306 x->ob_type->tp_name);
8307 Py_DECREF(x);
8308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 }
8310}
8311
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008312static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008313charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008314{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008315 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8316 /* exponentially overallocate to minimize reallocations */
8317 if (requiredsize < 2*outsize)
8318 requiredsize = 2*outsize;
8319 if (_PyBytes_Resize(outobj, requiredsize))
8320 return -1;
8321 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008322}
8323
/* Outcome of encoding one character with a character mapping. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* an error occurred, e.g. the output could not grow */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008327/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008328 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329 space is available. Return a new reference to the object that
8330 was put in the output buffer, or Py_None, if the mapping was undefined
8331 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008332 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008333static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008334charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008335 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008336{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008337 PyObject *rep;
8338 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008339 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008340
Christian Heimes90aa7642007-12-19 02:45:37 +00008341 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008342 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008344 if (res == -1)
8345 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 if (outsize<requiredsize)
8347 if (charmapencode_resize(outobj, outpos, requiredsize))
8348 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008349 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 outstart[(*outpos)++] = (char)res;
8351 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008352 }
8353
8354 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008355 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008357 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 Py_DECREF(rep);
8359 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008360 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 if (PyLong_Check(rep)) {
8362 Py_ssize_t requiredsize = *outpos+1;
8363 if (outsize<requiredsize)
8364 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8365 Py_DECREF(rep);
8366 return enc_EXCEPTION;
8367 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008368 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008370 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 else {
8372 const char *repchars = PyBytes_AS_STRING(rep);
8373 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8374 Py_ssize_t requiredsize = *outpos+repsize;
8375 if (outsize<requiredsize)
8376 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8377 Py_DECREF(rep);
8378 return enc_EXCEPTION;
8379 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008380 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 memcpy(outstart + *outpos, repchars, repsize);
8382 *outpos += repsize;
8383 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008384 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008385 Py_DECREF(rep);
8386 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387}
8388
8389/* handle an error in PyUnicode_EncodeCharmap
8390 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008391static int
8392charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008393 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008395 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008396 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397{
8398 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008399 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008400 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008401 enum PyUnicode_Kind kind;
8402 void *data;
8403 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008405 Py_ssize_t collstartpos = *inpos;
8406 Py_ssize_t collendpos = *inpos+1;
8407 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008408 const char *encoding = "charmap";
8409 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008410 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008411 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008412 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413
Benjamin Petersonbac79492012-01-14 13:34:47 -05008414 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008415 return -1;
8416 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417 /* find all unencodable characters */
8418 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008419 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008420 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008421 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008422 val = encoding_map_lookup(ch, mapping);
8423 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 break;
8425 ++collendpos;
8426 continue;
8427 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008428
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008429 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8430 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 if (rep==NULL)
8432 return -1;
8433 else if (rep!=Py_None) {
8434 Py_DECREF(rep);
8435 break;
8436 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008437 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008439 }
8440 /* cache callback name lookup
8441 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008442 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008443 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008444
8445 switch (*error_handler) {
8446 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008447 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008448 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008449
8450 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008451 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 x = charmapencode_output('?', mapping, res, respos);
8453 if (x==enc_EXCEPTION) {
8454 return -1;
8455 }
8456 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008457 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 return -1;
8459 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008460 }
8461 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008462 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008463 *inpos = collendpos;
8464 break;
Victor Stinner50149202015-09-22 00:26:54 +02008465
8466 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008467 /* generate replacement (temporarily (mis)uses p) */
8468 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 char buffer[2+29+1+1];
8470 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008471 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 for (cp = buffer; *cp; ++cp) {
8473 x = charmapencode_output(*cp, mapping, res, respos);
8474 if (x==enc_EXCEPTION)
8475 return -1;
8476 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008477 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 return -1;
8479 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008480 }
8481 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008482 *inpos = collendpos;
8483 break;
Victor Stinner50149202015-09-22 00:26:54 +02008484
Benjamin Peterson14339b62009-01-31 16:36:08 +00008485 default:
Victor Stinner50149202015-09-22 00:26:54 +02008486 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008487 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008489 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008491 if (PyBytes_Check(repunicode)) {
8492 /* Directly copy bytes result to output. */
8493 Py_ssize_t outsize = PyBytes_Size(*res);
8494 Py_ssize_t requiredsize;
8495 repsize = PyBytes_Size(repunicode);
8496 requiredsize = *respos + repsize;
8497 if (requiredsize > outsize)
8498 /* Make room for all additional bytes. */
8499 if (charmapencode_resize(res, respos, requiredsize)) {
8500 Py_DECREF(repunicode);
8501 return -1;
8502 }
8503 memcpy(PyBytes_AsString(*res) + *respos,
8504 PyBytes_AsString(repunicode), repsize);
8505 *respos += repsize;
8506 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008507 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008508 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008509 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008510 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008511 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008512 Py_DECREF(repunicode);
8513 return -1;
8514 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008515 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008516 data = PyUnicode_DATA(repunicode);
8517 kind = PyUnicode_KIND(repunicode);
8518 for (index = 0; index < repsize; index++) {
8519 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8520 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008522 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 return -1;
8524 }
8525 else if (x==enc_FAILED) {
8526 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008527 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 return -1;
8529 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008530 }
8531 *inpos = newpos;
8532 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008533 }
8534 return 0;
8535}
8536
Alexander Belopolsky40018472011-02-26 01:02:56 +00008537PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008538_PyUnicode_EncodeCharmap(PyObject *unicode,
8539 PyObject *mapping,
8540 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008542 /* output object */
8543 PyObject *res = NULL;
8544 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008545 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008546 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008547 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008548 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008549 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008550 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008551 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008552 void *data;
8553 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554
Benjamin Petersonbac79492012-01-14 13:34:47 -05008555 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008556 return NULL;
8557 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008558 data = PyUnicode_DATA(unicode);
8559 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008560
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561 /* Default to Latin-1 */
8562 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008563 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008565 /* allocate enough for a simple encoding without
8566 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008567 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008568 if (res == NULL)
8569 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008570 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008573 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008574 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008576 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 if (x==enc_EXCEPTION) /* error */
8578 goto onError;
8579 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008580 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008582 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 &res, &respos)) {
8584 goto onError;
8585 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008586 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 else
8588 /* done with this character => adjust input position */
8589 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008593 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008594 if (_PyBytes_Resize(&res, respos) < 0)
8595 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008597 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008598 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599 return res;
8600
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008602 Py_XDECREF(res);
8603 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008604 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605 return NULL;
8606}
8607
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008608/* Deprecated */
8609PyObject *
8610PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8611 Py_ssize_t size,
8612 PyObject *mapping,
8613 const char *errors)
8614{
8615 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008616 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008617 if (unicode == NULL)
8618 return NULL;
8619 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8620 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008621 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008622}
8623
Alexander Belopolsky40018472011-02-26 01:02:56 +00008624PyObject *
8625PyUnicode_AsCharmapString(PyObject *unicode,
8626 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627{
8628 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 PyErr_BadArgument();
8630 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008632 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633}
8634
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008635/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008636static void
8637make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008639 Py_ssize_t startpos, Py_ssize_t endpos,
8640 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 *exceptionObject = _PyUnicodeTranslateError_Create(
8644 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645 }
8646 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008647 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8648 goto onError;
8649 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8650 goto onError;
8651 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8652 goto onError;
8653 return;
8654 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008655 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656 }
8657}
8658
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008659/* error handling callback helper:
8660 build arguments, call the callback and check the arguments,
8661 put the result into newpos and return the replacement string, which
8662 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008663static PyObject *
8664unicode_translate_call_errorhandler(const char *errors,
8665 PyObject **errorHandler,
8666 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008668 Py_ssize_t startpos, Py_ssize_t endpos,
8669 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008671 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008672
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008673 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 PyObject *restuple;
8675 PyObject *resunicode;
8676
8677 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681 }
8682
8683 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008688 restuple = PyObject_CallFunctionObjArgs(
8689 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008690 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008692 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008693 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 Py_DECREF(restuple);
8695 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008697 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 &resunicode, &i_newpos)) {
8699 Py_DECREF(restuple);
8700 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008702 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008704 else
8705 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008706 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008707 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 Py_DECREF(restuple);
8709 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008710 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711 Py_INCREF(resunicode);
8712 Py_DECREF(restuple);
8713 return resunicode;
8714}
8715
8716/* Lookup the character ch in the mapping and put the result in result,
8717 which must be decrefed by the caller.
8718 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008719static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008721{
Christian Heimes217cfd12007-12-02 14:31:20 +00008722 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008723 PyObject *x;
8724
8725 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008727 x = PyObject_GetItem(mapping, w);
8728 Py_DECREF(w);
8729 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8731 /* No mapping found means: use 1:1 mapping. */
8732 PyErr_Clear();
8733 *result = NULL;
8734 return 0;
8735 } else
8736 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008737 }
8738 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 *result = x;
8740 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008741 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008742 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008744 if (value < 0 || value > MAX_UNICODE) {
8745 PyErr_Format(PyExc_ValueError,
8746 "character mapping must be in range(0x%x)",
8747 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008748 Py_DECREF(x);
8749 return -1;
8750 }
8751 *result = x;
8752 return 0;
8753 }
8754 else if (PyUnicode_Check(x)) {
8755 *result = x;
8756 return 0;
8757 }
8758 else {
8759 /* wrong return value */
8760 PyErr_SetString(PyExc_TypeError,
8761 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008762 Py_DECREF(x);
8763 return -1;
8764 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008765}
Victor Stinner1194ea02014-04-04 19:37:40 +02008766
8767/* lookup the character, write the result into the writer.
8768 Return 1 if the result was written into the writer, return 0 if the mapping
8769 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008770static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008771charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8772 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008773{
Victor Stinner1194ea02014-04-04 19:37:40 +02008774 PyObject *item;
8775
8776 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008778
8779 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008780 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008781 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008783 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008784 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008786
8787 if (item == Py_None) {
8788 Py_DECREF(item);
8789 return 0;
8790 }
8791
8792 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008793 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8794 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8795 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008796 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8797 Py_DECREF(item);
8798 return -1;
8799 }
8800 Py_DECREF(item);
8801 return 1;
8802 }
8803
8804 if (!PyUnicode_Check(item)) {
8805 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008806 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008807 }
8808
8809 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8810 Py_DECREF(item);
8811 return -1;
8812 }
8813
8814 Py_DECREF(item);
8815 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008816}
8817
Victor Stinner89a76ab2014-04-05 11:44:04 +02008818static int
8819unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8820 Py_UCS1 *translate)
8821{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008822 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008823 int ret = 0;
8824
Victor Stinner89a76ab2014-04-05 11:44:04 +02008825 if (charmaptranslate_lookup(ch, mapping, &item)) {
8826 return -1;
8827 }
8828
8829 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008830 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008831 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008832 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008833 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008834 /* not found => default to 1:1 mapping */
8835 translate[ch] = ch;
8836 return 1;
8837 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008838 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008839 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008840 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8841 used it */
8842 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008843 /* invalid character or character outside ASCII:
8844 skip the fast translate */
8845 goto exit;
8846 }
8847 translate[ch] = (Py_UCS1)replace;
8848 }
8849 else if (PyUnicode_Check(item)) {
8850 Py_UCS4 replace;
8851
8852 if (PyUnicode_READY(item) == -1) {
8853 Py_DECREF(item);
8854 return -1;
8855 }
8856 if (PyUnicode_GET_LENGTH(item) != 1)
8857 goto exit;
8858
8859 replace = PyUnicode_READ_CHAR(item, 0);
8860 if (replace > 127)
8861 goto exit;
8862 translate[ch] = (Py_UCS1)replace;
8863 }
8864 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008865 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008866 goto exit;
8867 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008868 ret = 1;
8869
Benjamin Peterson1365de72014-04-07 20:15:41 -04008870 exit:
8871 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008872 return ret;
8873}
8874
8875/* Fast path for ascii => ascii translation. Return 1 if the whole string
8876 was translated into writer, return 0 if the input string was partially
8877 translated into writer, raise an exception and return -1 on error. */
8878static int
8879unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008880 _PyUnicodeWriter *writer, int ignore,
8881 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008882{
Victor Stinner872b2912014-04-05 14:27:07 +02008883 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008884 Py_ssize_t len;
8885 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008886 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008887
Victor Stinner89a76ab2014-04-05 11:44:04 +02008888 len = PyUnicode_GET_LENGTH(input);
8889
Victor Stinner872b2912014-04-05 14:27:07 +02008890 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008891
8892 in = PyUnicode_1BYTE_DATA(input);
8893 end = in + len;
8894
8895 assert(PyUnicode_IS_ASCII(writer->buffer));
8896 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8897 out = PyUnicode_1BYTE_DATA(writer->buffer);
8898
Victor Stinner872b2912014-04-05 14:27:07 +02008899 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008900 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008901 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008902 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008903 int translate = unicode_fast_translate_lookup(mapping, ch,
8904 ascii_table);
8905 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008906 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008907 if (translate == 0)
8908 goto exit;
8909 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008910 }
Victor Stinner872b2912014-04-05 14:27:07 +02008911 if (ch2 == 0xfe) {
8912 if (ignore)
8913 continue;
8914 goto exit;
8915 }
8916 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008917 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008918 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008919 }
Victor Stinner872b2912014-04-05 14:27:07 +02008920 res = 1;
8921
8922exit:
8923 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008924 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008925 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008926}
8927
Victor Stinner3222da22015-10-01 22:07:32 +02008928static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929_PyUnicode_TranslateCharmap(PyObject *input,
8930 PyObject *mapping,
8931 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008934 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 Py_ssize_t size, i;
8936 int kind;
8937 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008938 _PyUnicodeWriter writer;
8939 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008940 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008941 PyObject *errorHandler = NULL;
8942 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008943 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008944 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008945
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008947 PyErr_BadArgument();
8948 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 if (PyUnicode_READY(input) == -1)
8952 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008953 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954 kind = PyUnicode_KIND(input);
8955 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008957 if (size == 0)
8958 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008959
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008960 /* allocate enough for a simple 1:1 translation without
8961 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008962 _PyUnicodeWriter_Init(&writer);
8963 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008964 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965
Victor Stinner872b2912014-04-05 14:27:07 +02008966 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8967
Victor Stinner33798672016-03-01 21:59:58 +01008968 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008969 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008970 if (PyUnicode_IS_ASCII(input)) {
8971 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8972 if (res < 0) {
8973 _PyUnicodeWriter_Dealloc(&writer);
8974 return NULL;
8975 }
8976 if (res == 1)
8977 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008978 }
Victor Stinner33798672016-03-01 21:59:58 +01008979 else {
8980 i = 0;
8981 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008983 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008984 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008985 int translate;
8986 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8987 Py_ssize_t newpos;
8988 /* startpos for collecting untranslatable chars */
8989 Py_ssize_t collstart;
8990 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008991 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992
Victor Stinner1194ea02014-04-04 19:37:40 +02008993 ch = PyUnicode_READ(kind, data, i);
8994 translate = charmaptranslate_output(ch, mapping, &writer);
8995 if (translate < 0)
8996 goto onError;
8997
8998 if (translate != 0) {
8999 /* it worked => adjust input pointer */
9000 ++i;
9001 continue;
9002 }
9003
9004 /* untranslatable character */
9005 collstart = i;
9006 collend = i+1;
9007
9008 /* find all untranslatable characters */
9009 while (collend < size) {
9010 PyObject *x;
9011 ch = PyUnicode_READ(kind, data, collend);
9012 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009013 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009014 Py_XDECREF(x);
9015 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009016 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009017 ++collend;
9018 }
9019
9020 if (ignore) {
9021 i = collend;
9022 }
9023 else {
9024 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9025 reason, input, &exc,
9026 collstart, collend, &newpos);
9027 if (repunicode == NULL)
9028 goto onError;
9029 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009030 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009031 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009032 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009033 Py_DECREF(repunicode);
9034 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009035 }
9036 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009037 Py_XDECREF(exc);
9038 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009039 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009040
Benjamin Peterson29060642009-01-31 22:14:21 +00009041 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009042 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009043 Py_XDECREF(exc);
9044 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045 return NULL;
9046}
9047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048/* Deprecated. Use PyUnicode_Translate instead. */
9049PyObject *
9050PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9051 Py_ssize_t size,
9052 PyObject *mapping,
9053 const char *errors)
9054{
Christian Heimes5f520f42012-09-11 14:03:25 +02009055 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009056 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 if (!unicode)
9058 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009059 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9060 Py_DECREF(unicode);
9061 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062}
9063
Alexander Belopolsky40018472011-02-26 01:02:56 +00009064PyObject *
9065PyUnicode_Translate(PyObject *str,
9066 PyObject *mapping,
9067 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009068{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009069 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009070 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009071 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009072}
Tim Petersced69f82003-09-16 20:30:58 +00009073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074PyObject *
9075_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9076{
9077 if (!PyUnicode_Check(unicode)) {
9078 PyErr_BadInternalCall();
9079 return NULL;
9080 }
9081 if (PyUnicode_READY(unicode) == -1)
9082 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009083 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009084 /* If the string is already ASCII, just return the same string */
9085 Py_INCREF(unicode);
9086 return unicode;
9087 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009088
9089 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9090 PyObject *result = PyUnicode_New(len, 127);
9091 if (result == NULL) {
9092 return NULL;
9093 }
9094
9095 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9096 int kind = PyUnicode_KIND(unicode);
9097 const void *data = PyUnicode_DATA(unicode);
9098 Py_ssize_t i;
9099 for (i = 0; i < len; ++i) {
9100 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9101 if (ch < 127) {
9102 out[i] = ch;
9103 }
9104 else if (Py_UNICODE_ISSPACE(ch)) {
9105 out[i] = ' ';
9106 }
9107 else {
9108 int decimal = Py_UNICODE_TODECIMAL(ch);
9109 if (decimal < 0) {
9110 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009111 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009112 _PyUnicode_LENGTH(result) = i + 1;
9113 break;
9114 }
9115 out[i] = '0' + decimal;
9116 }
9117 }
9118
INADA Naoki16dfca42018-07-14 12:06:43 +09009119 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009120 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009121}
9122
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009123PyObject *
9124PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9125 Py_ssize_t length)
9126{
Victor Stinnerf0124502011-11-21 23:12:56 +01009127 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009128 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009129 Py_UCS4 maxchar;
9130 enum PyUnicode_Kind kind;
9131 void *data;
9132
Victor Stinner99d7ad02012-02-22 13:37:39 +01009133 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009134 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009135 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009136 if (ch > 127) {
9137 int decimal = Py_UNICODE_TODECIMAL(ch);
9138 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009139 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009140 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009141 }
9142 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009143
9144 /* Copy to a new string */
9145 decimal = PyUnicode_New(length, maxchar);
9146 if (decimal == NULL)
9147 return decimal;
9148 kind = PyUnicode_KIND(decimal);
9149 data = PyUnicode_DATA(decimal);
9150 /* Iterate over code points */
9151 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009152 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009153 if (ch > 127) {
9154 int decimal = Py_UNICODE_TODECIMAL(ch);
9155 if (decimal >= 0)
9156 ch = '0' + decimal;
9157 }
9158 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009159 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009160 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009161}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009162/* --- Decimal Encoder ---------------------------------------------------- */
9163
Alexander Belopolsky40018472011-02-26 01:02:56 +00009164int
9165PyUnicode_EncodeDecimal(Py_UNICODE *s,
9166 Py_ssize_t length,
9167 char *output,
9168 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009169{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009170 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009171 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009172 enum PyUnicode_Kind kind;
9173 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009174
9175 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009176 PyErr_BadArgument();
9177 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009178 }
9179
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009180 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009181 if (unicode == NULL)
9182 return -1;
9183
Victor Stinner42bf7752011-11-21 22:52:58 +01009184 kind = PyUnicode_KIND(unicode);
9185 data = PyUnicode_DATA(unicode);
9186
Victor Stinnerb84d7232011-11-22 01:50:07 +01009187 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009188 PyObject *exc;
9189 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009190 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009191 Py_ssize_t startpos;
9192
9193 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009194
Benjamin Peterson29060642009-01-31 22:14:21 +00009195 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009196 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009197 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009198 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009199 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009200 decimal = Py_UNICODE_TODECIMAL(ch);
9201 if (decimal >= 0) {
9202 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009203 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009204 continue;
9205 }
9206 if (0 < ch && ch < 256) {
9207 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009208 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009209 continue;
9210 }
Victor Stinner6345be92011-11-25 20:09:01 +01009211
Victor Stinner42bf7752011-11-21 22:52:58 +01009212 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009213 exc = NULL;
9214 raise_encode_exception(&exc, "decimal", unicode,
9215 startpos, startpos+1,
9216 "invalid decimal Unicode string");
9217 Py_XDECREF(exc);
9218 Py_DECREF(unicode);
9219 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009220 }
9221 /* 0-terminate the output string */
9222 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009223 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009224 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009225}
9226
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227/* --- Helpers ------------------------------------------------------------ */
9228
/* helper macro to fixup start/end slice values

   `start` and `end` must be modifiable lvalues; `len` may be evaluated
   more than once.
     - end > len  -> end = len
     - end < 0    -> end += len, then clamped to 0 if still negative
     - start < 0  -> start += len, then clamped to 0 if still negative
   `start` is not clamped against `len`; callers compare end - start
   afterwards.

   Wrapped in do { } while (0) so that the expansion is a single statement
   and stays correct inside an unbraced if/else; arguments are
   parenthesized. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009245any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009247 Py_ssize_t end,
9248 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009249{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009250 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009251 void *buf1, *buf2;
9252 Py_ssize_t len1, len2, result;
9253
9254 kind1 = PyUnicode_KIND(s1);
9255 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009256 if (kind1 < kind2)
9257 return -1;
9258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009259 len1 = PyUnicode_GET_LENGTH(s1);
9260 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009261 ADJUST_INDICES(start, end, len1);
9262 if (end - start < len2)
9263 return -1;
9264
9265 buf1 = PyUnicode_DATA(s1);
9266 buf2 = PyUnicode_DATA(s2);
9267 if (len2 == 1) {
9268 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9269 result = findchar((const char *)buf1 + kind1*start,
9270 kind1, end - start, ch, direction);
9271 if (result == -1)
9272 return -1;
9273 else
9274 return start + result;
9275 }
9276
9277 if (kind2 != kind1) {
9278 buf2 = _PyUnicode_AsKind(s2, kind1);
9279 if (!buf2)
9280 return -2;
9281 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282
Victor Stinner794d5672011-10-10 03:21:36 +02009283 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009284 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009285 case PyUnicode_1BYTE_KIND:
9286 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9287 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9288 else
9289 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9290 break;
9291 case PyUnicode_2BYTE_KIND:
9292 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9293 break;
9294 case PyUnicode_4BYTE_KIND:
9295 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9296 break;
9297 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009298 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009299 }
9300 }
9301 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009302 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009303 case PyUnicode_1BYTE_KIND:
9304 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9305 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9306 else
9307 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9308 break;
9309 case PyUnicode_2BYTE_KIND:
9310 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9311 break;
9312 case PyUnicode_4BYTE_KIND:
9313 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9314 break;
9315 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009316 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009317 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 }
9319
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009320 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009321 PyMem_Free(buf2);
9322
9323 return result;
9324}
9325
9326Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009327_PyUnicode_InsertThousandsGrouping(
9328 PyObject *unicode, Py_ssize_t index,
9329 Py_ssize_t n_buffer,
9330 void *digits, Py_ssize_t n_digits,
9331 Py_ssize_t min_width,
9332 const char *grouping, PyObject *thousands_sep,
9333 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009334{
Victor Stinner41a863c2012-02-24 00:37:51 +01009335 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009336 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009337 Py_ssize_t thousands_sep_len;
9338 Py_ssize_t len;
9339
9340 if (unicode != NULL) {
9341 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009342 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009343 }
9344 else {
9345 kind = PyUnicode_1BYTE_KIND;
9346 data = NULL;
9347 }
9348 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9349 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9350 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9351 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009352 if (thousands_sep_kind < kind) {
9353 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9354 if (!thousands_sep_data)
9355 return -1;
9356 }
9357 else {
9358 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9359 if (!data)
9360 return -1;
9361 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009362 }
9363
Benjamin Petersonead6b532011-12-20 17:23:42 -06009364 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009366 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009367 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009368 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009369 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009370 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009371 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009372 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009373 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009374 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009375 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009376 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009378 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009379 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009380 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009381 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009382 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009384 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009385 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009386 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009387 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009388 break;
9389 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009390 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009392 if (unicode != NULL && thousands_sep_kind != kind) {
9393 if (thousands_sep_kind < kind)
9394 PyMem_Free(thousands_sep_data);
9395 else
9396 PyMem_Free(data);
9397 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009398 if (unicode == NULL) {
9399 *maxchar = 127;
9400 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009401 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009402 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009403 }
9404 }
9405 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406}
9407
9408
Alexander Belopolsky40018472011-02-26 01:02:56 +00009409Py_ssize_t
9410PyUnicode_Count(PyObject *str,
9411 PyObject *substr,
9412 Py_ssize_t start,
9413 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009414{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009415 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009416 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417 void *buf1 = NULL, *buf2 = NULL;
9418 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009419
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009420 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009421 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009422
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009423 kind1 = PyUnicode_KIND(str);
9424 kind2 = PyUnicode_KIND(substr);
9425 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009426 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009427
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009428 len1 = PyUnicode_GET_LENGTH(str);
9429 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009431 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009432 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009433
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009434 buf1 = PyUnicode_DATA(str);
9435 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009436 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009437 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009438 if (!buf2)
9439 goto onError;
9440 }
9441
9442 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009444 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009445 result = asciilib_count(
9446 ((Py_UCS1*)buf1) + start, end - start,
9447 buf2, len2, PY_SSIZE_T_MAX
9448 );
9449 else
9450 result = ucs1lib_count(
9451 ((Py_UCS1*)buf1) + start, end - start,
9452 buf2, len2, PY_SSIZE_T_MAX
9453 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 break;
9455 case PyUnicode_2BYTE_KIND:
9456 result = ucs2lib_count(
9457 ((Py_UCS2*)buf1) + start, end - start,
9458 buf2, len2, PY_SSIZE_T_MAX
9459 );
9460 break;
9461 case PyUnicode_4BYTE_KIND:
9462 result = ucs4lib_count(
9463 ((Py_UCS4*)buf1) + start, end - start,
9464 buf2, len2, PY_SSIZE_T_MAX
9465 );
9466 break;
9467 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009468 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009470
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009471 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 PyMem_Free(buf2);
9473
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009476 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 PyMem_Free(buf2);
9478 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479}
9480
Alexander Belopolsky40018472011-02-26 01:02:56 +00009481Py_ssize_t
9482PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009483 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009484 Py_ssize_t start,
9485 Py_ssize_t end,
9486 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009488 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009489 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009490
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009491 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492}
9493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494Py_ssize_t
9495PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9496 Py_ssize_t start, Py_ssize_t end,
9497 int direction)
9498{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009500 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 if (PyUnicode_READY(str) == -1)
9502 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009503 len = PyUnicode_GET_LENGTH(str);
9504 ADJUST_INDICES(start, end, len);
9505 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009506 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009508 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9509 kind, end-start, ch, direction);
9510 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009512 else
9513 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514}
9515
Alexander Belopolsky40018472011-02-26 01:02:56 +00009516static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009517tailmatch(PyObject *self,
9518 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009519 Py_ssize_t start,
9520 Py_ssize_t end,
9521 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 int kind_self;
9524 int kind_sub;
9525 void *data_self;
9526 void *data_sub;
9527 Py_ssize_t offset;
9528 Py_ssize_t i;
9529 Py_ssize_t end_sub;
9530
9531 if (PyUnicode_READY(self) == -1 ||
9532 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009533 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9536 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009538 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009540 if (PyUnicode_GET_LENGTH(substring) == 0)
9541 return 1;
9542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 kind_self = PyUnicode_KIND(self);
9544 data_self = PyUnicode_DATA(self);
9545 kind_sub = PyUnicode_KIND(substring);
9546 data_sub = PyUnicode_DATA(substring);
9547 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9548
9549 if (direction > 0)
9550 offset = end;
9551 else
9552 offset = start;
9553
9554 if (PyUnicode_READ(kind_self, data_self, offset) ==
9555 PyUnicode_READ(kind_sub, data_sub, 0) &&
9556 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9557 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9558 /* If both are of the same kind, memcmp is sufficient */
9559 if (kind_self == kind_sub) {
9560 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009561 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 data_sub,
9563 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009564 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009566 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 else {
9568 /* We do not need to compare 0 and len(substring)-1 because
9569 the if statement above ensured already that they are equal
9570 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 for (i = 1; i < end_sub; ++i) {
9572 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9573 PyUnicode_READ(kind_sub, data_sub, i))
9574 return 0;
9575 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009576 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578 }
9579
9580 return 0;
9581}
9582
Alexander Belopolsky40018472011-02-26 01:02:56 +00009583Py_ssize_t
9584PyUnicode_Tailmatch(PyObject *str,
9585 PyObject *substr,
9586 Py_ssize_t start,
9587 Py_ssize_t end,
9588 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009590 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009591 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009592
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009593 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009594}
9595
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009596static PyObject *
9597ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009598{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009599 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9600 char *resdata, *data = PyUnicode_DATA(self);
9601 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009602
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009603 res = PyUnicode_New(len, 127);
9604 if (res == NULL)
9605 return NULL;
9606 resdata = PyUnicode_DATA(res);
9607 if (lower)
9608 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009609 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009610 _Py_bytes_upper(resdata, data, len);
9611 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009612}
9613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009614static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009615handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009616{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009617 Py_ssize_t j;
9618 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009619 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009620 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009621
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009622 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9623
9624 where ! is a negation and \p{xxx} is a character with property xxx.
9625 */
9626 for (j = i - 1; j >= 0; j--) {
9627 c = PyUnicode_READ(kind, data, j);
9628 if (!_PyUnicode_IsCaseIgnorable(c))
9629 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009631 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9632 if (final_sigma) {
9633 for (j = i + 1; j < length; j++) {
9634 c = PyUnicode_READ(kind, data, j);
9635 if (!_PyUnicode_IsCaseIgnorable(c))
9636 break;
9637 }
9638 final_sigma = j == length || !_PyUnicode_IsCased(c);
9639 }
9640 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641}
9642
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009643static int
9644lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9645 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009646{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009647 /* Obscure special case. */
9648 if (c == 0x3A3) {
9649 mapped[0] = handle_capital_sigma(kind, data, length, i);
9650 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009652 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653}
9654
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009655static Py_ssize_t
9656do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658 Py_ssize_t i, k = 0;
9659 int n_res, j;
9660 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009661
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662 c = PyUnicode_READ(kind, data, 0);
9663 n_res = _PyUnicode_ToUpperFull(c, mapped);
9664 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009665 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009666 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009668 for (i = 1; i < length; i++) {
9669 c = PyUnicode_READ(kind, data, i);
9670 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9671 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009672 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009674 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009675 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009676 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677}
9678
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009679static Py_ssize_t
9680do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9681 Py_ssize_t i, k = 0;
9682
9683 for (i = 0; i < length; i++) {
9684 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9685 int n_res, j;
9686 if (Py_UNICODE_ISUPPER(c)) {
9687 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9688 }
9689 else if (Py_UNICODE_ISLOWER(c)) {
9690 n_res = _PyUnicode_ToUpperFull(c, mapped);
9691 }
9692 else {
9693 n_res = 1;
9694 mapped[0] = c;
9695 }
9696 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009697 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009698 res[k++] = mapped[j];
9699 }
9700 }
9701 return k;
9702}
9703
9704static Py_ssize_t
9705do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9706 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009707{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009708 Py_ssize_t i, k = 0;
9709
9710 for (i = 0; i < length; i++) {
9711 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9712 int n_res, j;
9713 if (lower)
9714 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9715 else
9716 n_res = _PyUnicode_ToUpperFull(c, mapped);
9717 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009718 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009719 res[k++] = mapped[j];
9720 }
9721 }
9722 return k;
9723}
9724
9725static Py_ssize_t
9726do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9727{
9728 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9729}
9730
9731static Py_ssize_t
9732do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9733{
9734 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9735}
9736
Benjamin Petersone51757f2012-01-12 21:10:29 -05009737static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009738do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9739{
9740 Py_ssize_t i, k = 0;
9741
9742 for (i = 0; i < length; i++) {
9743 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9744 Py_UCS4 mapped[3];
9745 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9746 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009747 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009748 res[k++] = mapped[j];
9749 }
9750 }
9751 return k;
9752}
9753
9754static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009755do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9756{
9757 Py_ssize_t i, k = 0;
9758 int previous_is_cased;
9759
9760 previous_is_cased = 0;
9761 for (i = 0; i < length; i++) {
9762 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9763 Py_UCS4 mapped[3];
9764 int n_res, j;
9765
9766 if (previous_is_cased)
9767 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9768 else
9769 n_res = _PyUnicode_ToTitleFull(c, mapped);
9770
9771 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009772 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009773 res[k++] = mapped[j];
9774 }
9775
9776 previous_is_cased = _PyUnicode_IsCased(c);
9777 }
9778 return k;
9779}
9780
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009781static PyObject *
9782case_operation(PyObject *self,
9783 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9784{
9785 PyObject *res = NULL;
9786 Py_ssize_t length, newlength = 0;
9787 int kind, outkind;
9788 void *data, *outdata;
9789 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9790
Benjamin Petersoneea48462012-01-16 14:28:50 -05009791 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009792
9793 kind = PyUnicode_KIND(self);
9794 data = PyUnicode_DATA(self);
9795 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009796 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009797 PyErr_SetString(PyExc_OverflowError, "string is too long");
9798 return NULL;
9799 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009800 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009801 if (tmp == NULL)
9802 return PyErr_NoMemory();
9803 newlength = perform(kind, data, length, tmp, &maxchar);
9804 res = PyUnicode_New(newlength, maxchar);
9805 if (res == NULL)
9806 goto leave;
9807 tmpend = tmp + newlength;
9808 outdata = PyUnicode_DATA(res);
9809 outkind = PyUnicode_KIND(res);
9810 switch (outkind) {
9811 case PyUnicode_1BYTE_KIND:
9812 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9813 break;
9814 case PyUnicode_2BYTE_KIND:
9815 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9816 break;
9817 case PyUnicode_4BYTE_KIND:
9818 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9819 break;
9820 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009821 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009822 }
9823 leave:
9824 PyMem_FREE(tmp);
9825 return res;
9826}
9827
Tim Peters8ce9f162004-08-27 01:49:32 +00009828PyObject *
9829PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009831 PyObject *res;
9832 PyObject *fseq;
9833 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009834 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009836 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009837 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009838 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009839 }
9840
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009841 /* NOTE: the following code can't call back into Python code,
9842 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009843 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009844
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009845 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009846 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009847 res = _PyUnicode_JoinArray(separator, items, seqlen);
9848 Py_DECREF(fseq);
9849 return res;
9850}
9851
9852PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009853_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009854{
9855 PyObject *res = NULL; /* the result */
9856 PyObject *sep = NULL;
9857 Py_ssize_t seplen;
9858 PyObject *item;
9859 Py_ssize_t sz, i, res_offset;
9860 Py_UCS4 maxchar;
9861 Py_UCS4 item_maxchar;
9862 int use_memcpy;
9863 unsigned char *res_data = NULL, *sep_data = NULL;
9864 PyObject *last_obj;
9865 unsigned int kind = 0;
9866
Tim Peters05eba1f2004-08-27 21:32:02 +00009867 /* If empty sequence, return u"". */
9868 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009869 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009870 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009871
Tim Peters05eba1f2004-08-27 21:32:02 +00009872 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009873 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009874 if (seqlen == 1) {
9875 if (PyUnicode_CheckExact(items[0])) {
9876 res = items[0];
9877 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009878 return res;
9879 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009880 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009881 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009882 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009883 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009884 /* Set up sep and seplen */
9885 if (separator == NULL) {
9886 /* fall back to a blank space separator */
9887 sep = PyUnicode_FromOrdinal(' ');
9888 if (!sep)
9889 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009890 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009891 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009892 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009893 else {
9894 if (!PyUnicode_Check(separator)) {
9895 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009896 "separator: expected str instance,"
9897 " %.80s found",
9898 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009899 goto onError;
9900 }
9901 if (PyUnicode_READY(separator))
9902 goto onError;
9903 sep = separator;
9904 seplen = PyUnicode_GET_LENGTH(separator);
9905 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9906 /* inc refcount to keep this code path symmetric with the
9907 above case of a blank separator */
9908 Py_INCREF(sep);
9909 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009910 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009911 }
9912
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009913 /* There are at least two things to join, or else we have a subclass
9914 * of str in the sequence.
9915 * Do a pre-pass to figure out the total amount of space we'll
9916 * need (sz), and see whether all argument are strings.
9917 */
9918 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009919#ifdef Py_DEBUG
9920 use_memcpy = 0;
9921#else
9922 use_memcpy = 1;
9923#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009924 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009925 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009926 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009927 if (!PyUnicode_Check(item)) {
9928 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009929 "sequence item %zd: expected str instance,"
9930 " %.80s found",
9931 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009932 goto onError;
9933 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 if (PyUnicode_READY(item) == -1)
9935 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009936 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009938 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009939 if (i != 0) {
9940 add_sz += seplen;
9941 }
9942 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009943 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009944 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009945 goto onError;
9946 }
Xiang Zhangb0541f42017-01-10 10:52:00 +08009947 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +02009948 if (use_memcpy && last_obj != NULL) {
9949 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9950 use_memcpy = 0;
9951 }
9952 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009953 }
Tim Petersced69f82003-09-16 20:30:58 +00009954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009956 if (res == NULL)
9957 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009958
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009959 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009960#ifdef Py_DEBUG
9961 use_memcpy = 0;
9962#else
9963 if (use_memcpy) {
9964 res_data = PyUnicode_1BYTE_DATA(res);
9965 kind = PyUnicode_KIND(res);
9966 if (seplen != 0)
9967 sep_data = PyUnicode_1BYTE_DATA(sep);
9968 }
9969#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009970 if (use_memcpy) {
9971 for (i = 0; i < seqlen; ++i) {
9972 Py_ssize_t itemlen;
9973 item = items[i];
9974
9975 /* Copy item, and maybe the separator. */
9976 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009977 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009978 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009979 kind * seplen);
9980 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009981 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009982
9983 itemlen = PyUnicode_GET_LENGTH(item);
9984 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009985 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009986 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009987 kind * itemlen);
9988 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009989 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009990 }
9991 assert(res_data == PyUnicode_1BYTE_DATA(res)
9992 + kind * PyUnicode_GET_LENGTH(res));
9993 }
9994 else {
9995 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9996 Py_ssize_t itemlen;
9997 item = items[i];
9998
9999 /* Copy item, and maybe the separator. */
10000 if (i && seplen != 0) {
10001 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10002 res_offset += seplen;
10003 }
10004
10005 itemlen = PyUnicode_GET_LENGTH(item);
10006 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010007 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010008 res_offset += itemlen;
10009 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010010 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010011 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010012 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010014 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010015 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010017
Benjamin Peterson29060642009-01-31 22:14:21 +000010018 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010020 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021 return NULL;
10022}
10023
/* Store `value` into `length` consecutive characters of the character
   buffer `data`, beginning at character index `start`.  `kind` selects the
   element width (1, 2 or 4 bytes) and must not be PyUnicode_WCHAR_KIND.

   Every macro argument is parenthesized where it is used so that an
   expression argument (for instance a conditional expression) is taken as
   a whole.  `kind`, `value` and `length` may be evaluated more than once,
   so arguments must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: Py_UNREACHABLE(); \
        } \
    } while (0)
10047
Victor Stinnerd3f08822012-05-29 12:57:52 +020010048void
10049_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10050 Py_UCS4 fill_char)
10051{
10052 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10053 const void *data = PyUnicode_DATA(unicode);
10054 assert(PyUnicode_IS_READY(unicode));
10055 assert(unicode_modifiable(unicode));
10056 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10057 assert(start >= 0);
10058 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10059 FILL(kind, data, fill_char, start, length);
10060}
10061
Victor Stinner3fe55312012-01-04 00:33:50 +010010062Py_ssize_t
10063PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10064 Py_UCS4 fill_char)
10065{
10066 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010067
10068 if (!PyUnicode_Check(unicode)) {
10069 PyErr_BadInternalCall();
10070 return -1;
10071 }
10072 if (PyUnicode_READY(unicode) == -1)
10073 return -1;
10074 if (unicode_check_modifiable(unicode))
10075 return -1;
10076
Victor Stinnerd3f08822012-05-29 12:57:52 +020010077 if (start < 0) {
10078 PyErr_SetString(PyExc_IndexError, "string index out of range");
10079 return -1;
10080 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010081 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10082 PyErr_SetString(PyExc_ValueError,
10083 "fill character is bigger than "
10084 "the string maximum character");
10085 return -1;
10086 }
10087
10088 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10089 length = Py_MIN(maxlen, length);
10090 if (length <= 0)
10091 return 0;
10092
Victor Stinnerd3f08822012-05-29 12:57:52 +020010093 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010094 return length;
10095}
10096
Victor Stinner9310abb2011-10-05 00:59:23 +020010097static PyObject *
10098pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010099 Py_ssize_t left,
10100 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010102{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 PyObject *u;
10104 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010105 int kind;
10106 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010107
10108 if (left < 0)
10109 left = 0;
10110 if (right < 0)
10111 right = 0;
10112
Victor Stinnerc4b49542011-12-11 22:44:26 +010010113 if (left == 0 && right == 0)
10114 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10117 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010118 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10119 return NULL;
10120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010122 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010124 if (!u)
10125 return NULL;
10126
10127 kind = PyUnicode_KIND(u);
10128 data = PyUnicode_DATA(u);
10129 if (left)
10130 FILL(kind, data, fill, 0, left);
10131 if (right)
10132 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010133 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010134 assert(_PyUnicode_CheckConsistency(u, 1));
10135 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136}
10137
Alexander Belopolsky40018472011-02-26 01:02:56 +000010138PyObject *
10139PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010143 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010144 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010145
Benjamin Petersonead6b532011-12-20 17:23:42 -060010146 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010148 if (PyUnicode_IS_ASCII(string))
10149 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010150 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010151 PyUnicode_GET_LENGTH(string), keepends);
10152 else
10153 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010154 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010155 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 break;
10157 case PyUnicode_2BYTE_KIND:
10158 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010159 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 PyUnicode_GET_LENGTH(string), keepends);
10161 break;
10162 case PyUnicode_4BYTE_KIND:
10163 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010164 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 PyUnicode_GET_LENGTH(string), keepends);
10166 break;
10167 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010168 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171}
10172
Alexander Belopolsky40018472011-02-26 01:02:56 +000010173static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010174split(PyObject *self,
10175 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010176 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010178 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 void *buf1, *buf2;
10180 Py_ssize_t len1, len2;
10181 PyObject* out;
10182
Guido van Rossumd57fd912000-03-10 22:53:23 +000010183 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010184 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 if (PyUnicode_READY(self) == -1)
10187 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010190 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010192 if (PyUnicode_IS_ASCII(self))
10193 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010194 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010195 PyUnicode_GET_LENGTH(self), maxcount
10196 );
10197 else
10198 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010199 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010200 PyUnicode_GET_LENGTH(self), maxcount
10201 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 case PyUnicode_2BYTE_KIND:
10203 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010204 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 PyUnicode_GET_LENGTH(self), maxcount
10206 );
10207 case PyUnicode_4BYTE_KIND:
10208 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010209 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 PyUnicode_GET_LENGTH(self), maxcount
10211 );
10212 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010213 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 }
10215
10216 if (PyUnicode_READY(substring) == -1)
10217 return NULL;
10218
10219 kind1 = PyUnicode_KIND(self);
10220 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 len1 = PyUnicode_GET_LENGTH(self);
10222 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010223 if (kind1 < kind2 || len1 < len2) {
10224 out = PyList_New(1);
10225 if (out == NULL)
10226 return NULL;
10227 Py_INCREF(self);
10228 PyList_SET_ITEM(out, 0, self);
10229 return out;
10230 }
10231 buf1 = PyUnicode_DATA(self);
10232 buf2 = PyUnicode_DATA(substring);
10233 if (kind2 != kind1) {
10234 buf2 = _PyUnicode_AsKind(substring, kind1);
10235 if (!buf2)
10236 return NULL;
10237 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010239 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010241 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10242 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010243 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010244 else
10245 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010246 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 break;
10248 case PyUnicode_2BYTE_KIND:
10249 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010250 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 break;
10252 case PyUnicode_4BYTE_KIND:
10253 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010254 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 break;
10256 default:
10257 out = NULL;
10258 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010259 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 PyMem_Free(buf2);
10261 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010262}
10263
Alexander Belopolsky40018472011-02-26 01:02:56 +000010264static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010265rsplit(PyObject *self,
10266 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010267 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010268{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010269 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 void *buf1, *buf2;
10271 Py_ssize_t len1, len2;
10272 PyObject* out;
10273
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010274 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010275 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 if (PyUnicode_READY(self) == -1)
10278 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010281 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010283 if (PyUnicode_IS_ASCII(self))
10284 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010285 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010286 PyUnicode_GET_LENGTH(self), maxcount
10287 );
10288 else
10289 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010290 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010291 PyUnicode_GET_LENGTH(self), maxcount
10292 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 case PyUnicode_2BYTE_KIND:
10294 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010295 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 PyUnicode_GET_LENGTH(self), maxcount
10297 );
10298 case PyUnicode_4BYTE_KIND:
10299 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010300 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 PyUnicode_GET_LENGTH(self), maxcount
10302 );
10303 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010304 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 }
10306
10307 if (PyUnicode_READY(substring) == -1)
10308 return NULL;
10309
10310 kind1 = PyUnicode_KIND(self);
10311 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 len1 = PyUnicode_GET_LENGTH(self);
10313 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010314 if (kind1 < kind2 || len1 < len2) {
10315 out = PyList_New(1);
10316 if (out == NULL)
10317 return NULL;
10318 Py_INCREF(self);
10319 PyList_SET_ITEM(out, 0, self);
10320 return out;
10321 }
10322 buf1 = PyUnicode_DATA(self);
10323 buf2 = PyUnicode_DATA(substring);
10324 if (kind2 != kind1) {
10325 buf2 = _PyUnicode_AsKind(substring, kind1);
10326 if (!buf2)
10327 return NULL;
10328 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010330 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010332 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10333 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010334 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010335 else
10336 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010337 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 break;
10339 case PyUnicode_2BYTE_KIND:
10340 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010341 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 break;
10343 case PyUnicode_4BYTE_KIND:
10344 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010345 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 break;
10347 default:
10348 out = NULL;
10349 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010350 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 PyMem_Free(buf2);
10352 return out;
10353}
10354
10355static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010356anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10357 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010359 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010361 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10362 return asciilib_find(buf1, len1, buf2, len2, offset);
10363 else
10364 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 case PyUnicode_2BYTE_KIND:
10366 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10367 case PyUnicode_4BYTE_KIND:
10368 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10369 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010370 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371}
10372
10373static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010374anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10375 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010377 switch (kind) {
10378 case PyUnicode_1BYTE_KIND:
10379 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10380 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10381 else
10382 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10383 case PyUnicode_2BYTE_KIND:
10384 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10385 case PyUnicode_4BYTE_KIND:
10386 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10387 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010388 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010389}
10390
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010391static void
10392replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10393 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10394{
10395 int kind = PyUnicode_KIND(u);
10396 void *data = PyUnicode_DATA(u);
10397 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10398 if (kind == PyUnicode_1BYTE_KIND) {
10399 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10400 (Py_UCS1 *)data + len,
10401 u1, u2, maxcount);
10402 }
10403 else if (kind == PyUnicode_2BYTE_KIND) {
10404 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10405 (Py_UCS2 *)data + len,
10406 u1, u2, maxcount);
10407 }
10408 else {
10409 assert(kind == PyUnicode_4BYTE_KIND);
10410 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10411 (Py_UCS4 *)data + len,
10412 u1, u2, maxcount);
10413 }
10414}
10415
Alexander Belopolsky40018472011-02-26 01:02:56 +000010416static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417replace(PyObject *self, PyObject *str1,
10418 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010419{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 PyObject *u;
10421 char *sbuf = PyUnicode_DATA(self);
10422 char *buf1 = PyUnicode_DATA(str1);
10423 char *buf2 = PyUnicode_DATA(str2);
10424 int srelease = 0, release1 = 0, release2 = 0;
10425 int skind = PyUnicode_KIND(self);
10426 int kind1 = PyUnicode_KIND(str1);
10427 int kind2 = PyUnicode_KIND(str2);
10428 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10429 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10430 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010431 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010432 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010433
10434 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010435 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010437 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010438
Victor Stinner59de0ee2011-10-07 10:01:28 +020010439 if (str1 == str2)
10440 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441
Victor Stinner49a0a212011-10-12 23:46:10 +020010442 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010443 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10444 if (maxchar < maxchar_str1)
10445 /* substring too wide to be present */
10446 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010447 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10448 /* Replacing str1 with str2 may cause a maxchar reduction in the
10449 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010450 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010451 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010454 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010456 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010458 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010459 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010460 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010461
Victor Stinner69ed0f42013-04-09 21:48:24 +020010462 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010463 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010464 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010465 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010466 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010468 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010470
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010471 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10472 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010473 }
10474 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 int rkind = skind;
10476 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010477 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010479 if (kind1 < rkind) {
10480 /* widen substring */
10481 buf1 = _PyUnicode_AsKind(str1, rkind);
10482 if (!buf1) goto error;
10483 release1 = 1;
10484 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010485 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010486 if (i < 0)
10487 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 if (rkind > kind2) {
10489 /* widen replacement */
10490 buf2 = _PyUnicode_AsKind(str2, rkind);
10491 if (!buf2) goto error;
10492 release2 = 1;
10493 }
10494 else if (rkind < kind2) {
10495 /* widen self and buf1 */
10496 rkind = kind2;
10497 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010498 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 sbuf = _PyUnicode_AsKind(self, rkind);
10500 if (!sbuf) goto error;
10501 srelease = 1;
10502 buf1 = _PyUnicode_AsKind(str1, rkind);
10503 if (!buf1) goto error;
10504 release1 = 1;
10505 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010506 u = PyUnicode_New(slen, maxchar);
10507 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010509 assert(PyUnicode_KIND(u) == rkind);
10510 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010511
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010512 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010513 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010514 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010516 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010518
10519 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010520 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010521 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010522 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010523 if (i == -1)
10524 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010525 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010527 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010529 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010531 }
10532 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010534 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 int rkind = skind;
10536 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010539 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 buf1 = _PyUnicode_AsKind(str1, rkind);
10541 if (!buf1) goto error;
10542 release1 = 1;
10543 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010544 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010545 if (n == 0)
10546 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010548 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 buf2 = _PyUnicode_AsKind(str2, rkind);
10550 if (!buf2) goto error;
10551 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010552 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010554 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 rkind = kind2;
10556 sbuf = _PyUnicode_AsKind(self, rkind);
10557 if (!sbuf) goto error;
10558 srelease = 1;
10559 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010560 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 buf1 = _PyUnicode_AsKind(str1, rkind);
10562 if (!buf1) goto error;
10563 release1 = 1;
10564 }
10565 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10566 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010567 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 PyErr_SetString(PyExc_OverflowError,
10569 "replace string is too long");
10570 goto error;
10571 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010572 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010573 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010574 _Py_INCREF_UNICODE_EMPTY();
10575 if (!unicode_empty)
10576 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010577 u = unicode_empty;
10578 goto done;
10579 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010580 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 PyErr_SetString(PyExc_OverflowError,
10582 "replace string is too long");
10583 goto error;
10584 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010585 u = PyUnicode_New(new_size, maxchar);
10586 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010588 assert(PyUnicode_KIND(u) == rkind);
10589 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 ires = i = 0;
10591 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010592 while (n-- > 0) {
10593 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010594 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010595 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010596 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010597 if (j == -1)
10598 break;
10599 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010600 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010601 memcpy(res + rkind * ires,
10602 sbuf + rkind * i,
10603 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010605 }
10606 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010608 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010610 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010614 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010616 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010617 memcpy(res + rkind * ires,
10618 sbuf + rkind * i,
10619 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010620 }
10621 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010622 /* interleave */
10623 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010624 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010626 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010628 if (--n <= 0)
10629 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010630 memcpy(res + rkind * ires,
10631 sbuf + rkind * i,
10632 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 ires++;
10634 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010635 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010636 memcpy(res + rkind * ires,
10637 sbuf + rkind * i,
10638 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010639 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010640 }
10641
10642 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010643 unicode_adjust_maxchar(&u);
10644 if (u == NULL)
10645 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010647
10648 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 if (srelease)
10650 PyMem_FREE(sbuf);
10651 if (release1)
10652 PyMem_FREE(buf1);
10653 if (release2)
10654 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010655 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010657
Benjamin Peterson29060642009-01-31 22:14:21 +000010658 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010659 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 if (srelease)
10661 PyMem_FREE(sbuf);
10662 if (release1)
10663 PyMem_FREE(buf1);
10664 if (release2)
10665 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010666 return unicode_result_unchanged(self);
10667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 error:
10669 if (srelease && sbuf)
10670 PyMem_FREE(sbuf);
10671 if (release1 && buf1)
10672 PyMem_FREE(buf1);
10673 if (release2 && buf2)
10674 PyMem_FREE(buf2);
10675 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010676}
10677
10678/* --- Unicode Object Methods --------------------------------------------- */
10679
INADA Naoki3ae20562017-01-16 20:41:20 +090010680/*[clinic input]
10681str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682
INADA Naoki3ae20562017-01-16 20:41:20 +090010683Return a version of the string where each word is titlecased.
10684
10685More specifically, words start with uppercased characters and all remaining
10686cased characters have lower case.
10687[clinic start generated code]*/
10688
10689static PyObject *
10690unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010691/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010692{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010693 if (PyUnicode_READY(self) == -1)
10694 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010695 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696}
10697
INADA Naoki3ae20562017-01-16 20:41:20 +090010698/*[clinic input]
10699str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700
INADA Naoki3ae20562017-01-16 20:41:20 +090010701Return a capitalized version of the string.
10702
10703More specifically, make the first character have upper case and the rest lower
10704case.
10705[clinic start generated code]*/
10706
10707static PyObject *
10708unicode_capitalize_impl(PyObject *self)
10709/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010711 if (PyUnicode_READY(self) == -1)
10712 return NULL;
10713 if (PyUnicode_GET_LENGTH(self) == 0)
10714 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010715 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716}
10717
INADA Naoki3ae20562017-01-16 20:41:20 +090010718/*[clinic input]
10719str.casefold as unicode_casefold
10720
10721Return a version of the string suitable for caseless comparisons.
10722[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010723
10724static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010725unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010726/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010727{
10728 if (PyUnicode_READY(self) == -1)
10729 return NULL;
10730 if (PyUnicode_IS_ASCII(self))
10731 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010732 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010733}
10734
10735
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010736/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010737
10738static int
10739convert_uc(PyObject *obj, void *addr)
10740{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010742
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010743 if (!PyUnicode_Check(obj)) {
10744 PyErr_Format(PyExc_TypeError,
10745 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010746 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010747 return 0;
10748 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010749 if (PyUnicode_READY(obj) < 0)
10750 return 0;
10751 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010752 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010753 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010754 return 0;
10755 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010756 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010757 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010758}
10759
INADA Naoki3ae20562017-01-16 20:41:20 +090010760/*[clinic input]
10761str.center as unicode_center
10762
10763 width: Py_ssize_t
10764 fillchar: Py_UCS4 = ' '
10765 /
10766
10767Return a centered string of length width.
10768
10769Padding is done using the specified fill character (default is a space).
10770[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771
10772static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010773unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10774/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010776 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010777
Benjamin Petersonbac79492012-01-14 13:34:47 -050010778 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779 return NULL;
10780
Victor Stinnerc4b49542011-12-11 22:44:26 +010010781 if (PyUnicode_GET_LENGTH(self) >= width)
10782 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783
Victor Stinnerc4b49542011-12-11 22:44:26 +010010784 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785 left = marg / 2 + (marg & width & 1);
10786
Victor Stinner9310abb2011-10-05 00:59:23 +020010787 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010788}
10789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790/* This function assumes that str1 and str2 are readied by the caller. */
10791
Marc-André Lemburge5034372000-08-08 08:04:29 +000010792static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010793unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010794{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010795#define COMPARE(TYPE1, TYPE2) \
10796 do { \
10797 TYPE1* p1 = (TYPE1 *)data1; \
10798 TYPE2* p2 = (TYPE2 *)data2; \
10799 TYPE1* end = p1 + len; \
10800 Py_UCS4 c1, c2; \
10801 for (; p1 != end; p1++, p2++) { \
10802 c1 = *p1; \
10803 c2 = *p2; \
10804 if (c1 != c2) \
10805 return (c1 < c2) ? -1 : 1; \
10806 } \
10807 } \
10808 while (0)
10809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 int kind1, kind2;
10811 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010812 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 kind1 = PyUnicode_KIND(str1);
10815 kind2 = PyUnicode_KIND(str2);
10816 data1 = PyUnicode_DATA(str1);
10817 data2 = PyUnicode_DATA(str2);
10818 len1 = PyUnicode_GET_LENGTH(str1);
10819 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010820 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010821
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010822 switch(kind1) {
10823 case PyUnicode_1BYTE_KIND:
10824 {
10825 switch(kind2) {
10826 case PyUnicode_1BYTE_KIND:
10827 {
10828 int cmp = memcmp(data1, data2, len);
10829 /* normalize result of memcmp() into the range [-1; 1] */
10830 if (cmp < 0)
10831 return -1;
10832 if (cmp > 0)
10833 return 1;
10834 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010835 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010836 case PyUnicode_2BYTE_KIND:
10837 COMPARE(Py_UCS1, Py_UCS2);
10838 break;
10839 case PyUnicode_4BYTE_KIND:
10840 COMPARE(Py_UCS1, Py_UCS4);
10841 break;
10842 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010843 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010844 }
10845 break;
10846 }
10847 case PyUnicode_2BYTE_KIND:
10848 {
10849 switch(kind2) {
10850 case PyUnicode_1BYTE_KIND:
10851 COMPARE(Py_UCS2, Py_UCS1);
10852 break;
10853 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010854 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010855 COMPARE(Py_UCS2, Py_UCS2);
10856 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010857 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010858 case PyUnicode_4BYTE_KIND:
10859 COMPARE(Py_UCS2, Py_UCS4);
10860 break;
10861 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010862 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010863 }
10864 break;
10865 }
10866 case PyUnicode_4BYTE_KIND:
10867 {
10868 switch(kind2) {
10869 case PyUnicode_1BYTE_KIND:
10870 COMPARE(Py_UCS4, Py_UCS1);
10871 break;
10872 case PyUnicode_2BYTE_KIND:
10873 COMPARE(Py_UCS4, Py_UCS2);
10874 break;
10875 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010876 {
10877#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10878 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10879 /* normalize result of wmemcmp() into the range [-1; 1] */
10880 if (cmp < 0)
10881 return -1;
10882 if (cmp > 0)
10883 return 1;
10884#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010885 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010886#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010887 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010888 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010889 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010890 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010891 }
10892 break;
10893 }
10894 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010895 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010896 }
10897
Victor Stinner770e19e2012-10-04 22:59:45 +020010898 if (len1 == len2)
10899 return 0;
10900 if (len1 < len2)
10901 return -1;
10902 else
10903 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010904
10905#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010906}
10907
Benjamin Peterson621b4302016-09-09 13:54:34 -070010908static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010909unicode_compare_eq(PyObject *str1, PyObject *str2)
10910{
10911 int kind;
10912 void *data1, *data2;
10913 Py_ssize_t len;
10914 int cmp;
10915
Victor Stinnere5567ad2012-10-23 02:48:49 +020010916 len = PyUnicode_GET_LENGTH(str1);
10917 if (PyUnicode_GET_LENGTH(str2) != len)
10918 return 0;
10919 kind = PyUnicode_KIND(str1);
10920 if (PyUnicode_KIND(str2) != kind)
10921 return 0;
10922 data1 = PyUnicode_DATA(str1);
10923 data2 = PyUnicode_DATA(str2);
10924
10925 cmp = memcmp(data1, data2, len * kind);
10926 return (cmp == 0);
10927}
10928
10929
Alexander Belopolsky40018472011-02-26 01:02:56 +000010930int
10931PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10934 if (PyUnicode_READY(left) == -1 ||
10935 PyUnicode_READY(right) == -1)
10936 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010937
10938 /* a string is equal to itself */
10939 if (left == right)
10940 return 0;
10941
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010942 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010943 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010944 PyErr_Format(PyExc_TypeError,
10945 "Can't compare %.100s and %.100s",
10946 left->ob_type->tp_name,
10947 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948 return -1;
10949}
10950
Martin v. Löwis5b222132007-06-10 09:51:05 +000010951int
10952PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10953{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 Py_ssize_t i;
10955 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010957 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958
Victor Stinner910337b2011-10-03 03:20:16 +020010959 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010960 if (!PyUnicode_IS_READY(uni)) {
10961 const wchar_t *ws = _PyUnicode_WSTR(uni);
10962 /* Compare Unicode string and source character set string */
10963 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
10964 if (chr != ustr[i])
10965 return (chr < ustr[i]) ? -1 : 1;
10966 }
10967 /* This check keeps Python strings that end in '\0' from comparing equal
10968 to C strings identical up to that point. */
10969 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
10970 return 1; /* uni is longer */
10971 if (ustr[i])
10972 return -1; /* str is longer */
10973 return 0;
10974 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010975 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010976 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010977 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010978 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010979 size_t len, len2 = strlen(str);
10980 int cmp;
10981
10982 len = Py_MIN(len1, len2);
10983 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010984 if (cmp != 0) {
10985 if (cmp < 0)
10986 return -1;
10987 else
10988 return 1;
10989 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010990 if (len1 > len2)
10991 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010992 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010993 return -1; /* str is longer */
10994 return 0;
10995 }
10996 else {
10997 void *data = PyUnicode_DATA(uni);
10998 /* Compare Unicode string and source character set string */
10999 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011000 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011001 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11002 /* This check keeps Python strings that end in '\0' from comparing equal
11003 to C strings identical up to that point. */
11004 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11005 return 1; /* uni is longer */
11006 if (str[i])
11007 return -1; /* str is longer */
11008 return 0;
11009 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011010}
11011
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011012static int
11013non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11014{
11015 size_t i, len;
11016 const wchar_t *p;
11017 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11018 if (strlen(str) != len)
11019 return 0;
11020 p = _PyUnicode_WSTR(unicode);
11021 assert(p);
11022 for (i = 0; i < len; i++) {
11023 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011024 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011025 return 0;
11026 }
11027 return 1;
11028}
11029
11030int
11031_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11032{
11033 size_t len;
11034 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011035 assert(str);
11036#ifndef NDEBUG
11037 for (const char *p = str; *p; p++) {
11038 assert((unsigned char)*p < 128);
11039 }
11040#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011041 if (PyUnicode_READY(unicode) == -1) {
11042 /* Memory error or bad data */
11043 PyErr_Clear();
11044 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11045 }
11046 if (!PyUnicode_IS_ASCII(unicode))
11047 return 0;
11048 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11049 return strlen(str) == len &&
11050 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11051}
11052
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011053int
11054_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11055{
11056 PyObject *right_uni;
11057 Py_hash_t hash;
11058
11059 assert(_PyUnicode_CHECK(left));
11060 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011061#ifndef NDEBUG
11062 for (const char *p = right->string; *p; p++) {
11063 assert((unsigned char)*p < 128);
11064 }
11065#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011066
11067 if (PyUnicode_READY(left) == -1) {
11068 /* memory error or bad data */
11069 PyErr_Clear();
11070 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11071 }
11072
11073 if (!PyUnicode_IS_ASCII(left))
11074 return 0;
11075
11076 right_uni = _PyUnicode_FromId(right); /* borrowed */
11077 if (right_uni == NULL) {
11078 /* memory error or bad data */
11079 PyErr_Clear();
11080 return _PyUnicode_EqualToASCIIString(left, right->string);
11081 }
11082
11083 if (left == right_uni)
11084 return 1;
11085
11086 if (PyUnicode_CHECK_INTERNED(left))
11087 return 0;
11088
INADA Naoki7cc95f52018-01-28 02:07:09 +090011089 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011090 hash = _PyUnicode_HASH(left);
11091 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11092 return 0;
11093
11094 return unicode_compare_eq(left, right_uni);
11095}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011096
Alexander Belopolsky40018472011-02-26 01:02:56 +000011097PyObject *
11098PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011099{
11100 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011101
Victor Stinnere5567ad2012-10-23 02:48:49 +020011102 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11103 Py_RETURN_NOTIMPLEMENTED;
11104
11105 if (PyUnicode_READY(left) == -1 ||
11106 PyUnicode_READY(right) == -1)
11107 return NULL;
11108
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011109 if (left == right) {
11110 switch (op) {
11111 case Py_EQ:
11112 case Py_LE:
11113 case Py_GE:
11114 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011115 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011116 case Py_NE:
11117 case Py_LT:
11118 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011119 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011120 default:
11121 PyErr_BadArgument();
11122 return NULL;
11123 }
11124 }
11125 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011126 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011127 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011128 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011129 }
11130 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011131 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011132 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011133 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011134}
11135
Alexander Belopolsky40018472011-02-26 01:02:56 +000011136int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011137_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11138{
11139 return unicode_eq(aa, bb);
11140}
11141
11142int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011143PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011144{
Victor Stinner77282cb2013-04-14 19:22:47 +020011145 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 void *buf1, *buf2;
11147 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011148 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011149
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011150 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011151 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011152 "'in <string>' requires string as left operand, not %.100s",
11153 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011154 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011155 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011156 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011157 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011158 if (ensure_unicode(str) < 0)
11159 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011161 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011162 kind2 = PyUnicode_KIND(substr);
11163 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011164 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011166 len2 = PyUnicode_GET_LENGTH(substr);
11167 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011168 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011169 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011170 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011171 if (len2 == 1) {
11172 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11173 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011174 return result;
11175 }
11176 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011177 buf2 = _PyUnicode_AsKind(substr, kind1);
11178 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011179 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011180 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181
Victor Stinner77282cb2013-04-14 19:22:47 +020011182 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011183 case PyUnicode_1BYTE_KIND:
11184 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11185 break;
11186 case PyUnicode_2BYTE_KIND:
11187 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11188 break;
11189 case PyUnicode_4BYTE_KIND:
11190 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11191 break;
11192 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011193 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011195
Victor Stinner77282cb2013-04-14 19:22:47 +020011196 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011197 PyMem_Free(buf2);
11198
Guido van Rossum403d68b2000-03-13 15:55:09 +000011199 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011200}
11201
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202/* Concat to string or Unicode object giving a new Unicode object. */
11203
Alexander Belopolsky40018472011-02-26 01:02:56 +000011204PyObject *
11205PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011207 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011208 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011209 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011211 if (ensure_unicode(left) < 0)
11212 return NULL;
11213
11214 if (!PyUnicode_Check(right)) {
11215 PyErr_Format(PyExc_TypeError,
11216 "can only concatenate str (not \"%.200s\") to str",
11217 right->ob_type->tp_name);
11218 return NULL;
11219 }
11220 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011221 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222
11223 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011224 if (left == unicode_empty)
11225 return PyUnicode_FromObject(right);
11226 if (right == unicode_empty)
11227 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011229 left_len = PyUnicode_GET_LENGTH(left);
11230 right_len = PyUnicode_GET_LENGTH(right);
11231 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011232 PyErr_SetString(PyExc_OverflowError,
11233 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011234 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011235 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011236 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011237
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011238 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11239 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011240 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011243 result = PyUnicode_New(new_len, maxchar);
11244 if (result == NULL)
11245 return NULL;
11246 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11247 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11248 assert(_PyUnicode_CheckConsistency(result, 1));
11249 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250}
11251
Walter Dörwald1ab83302007-05-18 17:15:44 +000011252void
Victor Stinner23e56682011-10-03 03:54:37 +020011253PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011254{
Victor Stinner23e56682011-10-03 03:54:37 +020011255 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011256 Py_UCS4 maxchar, maxchar2;
11257 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011258
11259 if (p_left == NULL) {
11260 if (!PyErr_Occurred())
11261 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011262 return;
11263 }
Victor Stinner23e56682011-10-03 03:54:37 +020011264 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011265 if (right == NULL || left == NULL
11266 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011267 if (!PyErr_Occurred())
11268 PyErr_BadInternalCall();
11269 goto error;
11270 }
11271
Benjamin Petersonbac79492012-01-14 13:34:47 -050011272 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011273 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011274 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011275 goto error;
11276
Victor Stinner488fa492011-12-12 00:01:39 +010011277 /* Shortcuts */
11278 if (left == unicode_empty) {
11279 Py_DECREF(left);
11280 Py_INCREF(right);
11281 *p_left = right;
11282 return;
11283 }
11284 if (right == unicode_empty)
11285 return;
11286
11287 left_len = PyUnicode_GET_LENGTH(left);
11288 right_len = PyUnicode_GET_LENGTH(right);
11289 if (left_len > PY_SSIZE_T_MAX - right_len) {
11290 PyErr_SetString(PyExc_OverflowError,
11291 "strings are too large to concat");
11292 goto error;
11293 }
11294 new_len = left_len + right_len;
11295
11296 if (unicode_modifiable(left)
11297 && PyUnicode_CheckExact(right)
11298 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011299 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11300 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011301 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011302 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011303 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11304 {
11305 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011306 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011307 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011308
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011309 /* copy 'right' into the newly allocated area of 'left' */
11310 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011311 }
Victor Stinner488fa492011-12-12 00:01:39 +010011312 else {
11313 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11314 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011315 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011316
Victor Stinner488fa492011-12-12 00:01:39 +010011317 /* Concat the two Unicode strings */
11318 res = PyUnicode_New(new_len, maxchar);
11319 if (res == NULL)
11320 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011321 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11322 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011323 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011324 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011325 }
11326 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011327 return;
11328
11329error:
Victor Stinner488fa492011-12-12 00:01:39 +010011330 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011331}
11332
11333void
11334PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11335{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011336 PyUnicode_Append(pleft, right);
11337 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011338}
11339
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011340/*
11341Wraps stringlib_parse_args_finds() and additionally ensures that the
11342first argument is a unicode object.
11343*/
11344
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011345static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011346parse_args_finds_unicode(const char * function_name, PyObject *args,
11347 PyObject **substring,
11348 Py_ssize_t *start, Py_ssize_t *end)
11349{
11350 if(stringlib_parse_args_finds(function_name, args, substring,
11351 start, end)) {
11352 if (ensure_unicode(*substring) < 0)
11353 return 0;
11354 return 1;
11355 }
11356 return 0;
11357}
11358
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011359PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011360 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011362Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011363string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011364interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365
11366static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011367unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011369 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011370 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011371 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011373 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 void *buf1, *buf2;
11375 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011377 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011378 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380 kind1 = PyUnicode_KIND(self);
11381 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011382 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011383 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011385 len1 = PyUnicode_GET_LENGTH(self);
11386 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011388 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011389 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011390
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011391 buf1 = PyUnicode_DATA(self);
11392 buf2 = PyUnicode_DATA(substring);
11393 if (kind2 != kind1) {
11394 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011395 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011396 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011397 }
11398 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399 case PyUnicode_1BYTE_KIND:
11400 iresult = ucs1lib_count(
11401 ((Py_UCS1*)buf1) + start, end - start,
11402 buf2, len2, PY_SSIZE_T_MAX
11403 );
11404 break;
11405 case PyUnicode_2BYTE_KIND:
11406 iresult = ucs2lib_count(
11407 ((Py_UCS2*)buf1) + start, end - start,
11408 buf2, len2, PY_SSIZE_T_MAX
11409 );
11410 break;
11411 case PyUnicode_4BYTE_KIND:
11412 iresult = ucs4lib_count(
11413 ((Py_UCS4*)buf1) + start, end - start,
11414 buf2, len2, PY_SSIZE_T_MAX
11415 );
11416 break;
11417 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011418 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011419 }
11420
11421 result = PyLong_FromSsize_t(iresult);
11422
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011423 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426 return result;
11427}
11428
INADA Naoki3ae20562017-01-16 20:41:20 +090011429/*[clinic input]
11430str.encode as unicode_encode
11431
11432 encoding: str(c_default="NULL") = 'utf-8'
11433 The encoding in which to encode the string.
11434 errors: str(c_default="NULL") = 'strict'
11435 The error handling scheme to use for encoding errors.
11436 The default is 'strict' meaning that encoding errors raise a
11437 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11438 'xmlcharrefreplace' as well as any other name registered with
11439 codecs.register_error that can handle UnicodeEncodeErrors.
11440
11441Encode the string using the codec registered for encoding.
11442[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
11444static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011445unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011446/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011448 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011449}
11450
INADA Naoki3ae20562017-01-16 20:41:20 +090011451/*[clinic input]
11452str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453
INADA Naoki3ae20562017-01-16 20:41:20 +090011454 tabsize: int = 8
11455
11456Return a copy where all tab characters are expanded using spaces.
11457
11458If tabsize is not given, a tab size of 8 characters is assumed.
11459[clinic start generated code]*/
11460
11461static PyObject *
11462unicode_expandtabs_impl(PyObject *self, int tabsize)
11463/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011465 Py_ssize_t i, j, line_pos, src_len, incr;
11466 Py_UCS4 ch;
11467 PyObject *u;
11468 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011469 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011470 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471
Antoine Pitrou22425222011-10-04 19:10:51 +020011472 if (PyUnicode_READY(self) == -1)
11473 return NULL;
11474
Thomas Wouters7e474022000-07-16 12:04:32 +000011475 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011476 src_len = PyUnicode_GET_LENGTH(self);
11477 i = j = line_pos = 0;
11478 kind = PyUnicode_KIND(self);
11479 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011480 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011481 for (; i < src_len; i++) {
11482 ch = PyUnicode_READ(kind, src_data, i);
11483 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011484 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011485 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011486 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011487 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011488 goto overflow;
11489 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011490 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011491 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011492 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011494 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011495 goto overflow;
11496 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011498 if (ch == '\n' || ch == '\r')
11499 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011501 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011502 if (!found)
11503 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011504
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011506 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507 if (!u)
11508 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011509 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510
Antoine Pitroue71d5742011-10-04 15:55:09 +020011511 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512
Antoine Pitroue71d5742011-10-04 15:55:09 +020011513 for (; i < src_len; i++) {
11514 ch = PyUnicode_READ(kind, src_data, i);
11515 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011516 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011517 incr = tabsize - (line_pos % tabsize);
11518 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011519 FILL(kind, dest_data, ' ', j, incr);
11520 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011521 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011522 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011524 line_pos++;
11525 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011526 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011527 if (ch == '\n' || ch == '\r')
11528 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011530 }
11531 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011532 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011533
Antoine Pitroue71d5742011-10-04 15:55:09 +020011534 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011535 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11536 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537}
11538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011539PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011540 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541\n\
11542Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011543such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544arguments start and end are interpreted as in slice notation.\n\
11545\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011546Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547
11548static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011549unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011551 /* initialize variables to prevent gcc warning */
11552 PyObject *substring = NULL;
11553 Py_ssize_t start = 0;
11554 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011555 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011557 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011560 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011561 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011563 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011565 if (result == -2)
11566 return NULL;
11567
Christian Heimes217cfd12007-12-02 14:31:20 +000011568 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569}
11570
11571static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011572unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011574 void *data;
11575 enum PyUnicode_Kind kind;
11576 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011577
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011578 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011579 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011581 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011582 if (PyUnicode_READY(self) == -1) {
11583 return NULL;
11584 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011585 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11586 PyErr_SetString(PyExc_IndexError, "string index out of range");
11587 return NULL;
11588 }
11589 kind = PyUnicode_KIND(self);
11590 data = PyUnicode_DATA(self);
11591 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011592 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593}
11594
Guido van Rossumc2504932007-09-18 19:42:40 +000011595/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011596 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011597static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011598unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599{
Guido van Rossumc2504932007-09-18 19:42:40 +000011600 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011601 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011602
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011603#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011604 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011605#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011606 if (_PyUnicode_HASH(self) != -1)
11607 return _PyUnicode_HASH(self);
11608 if (PyUnicode_READY(self) == -1)
11609 return -1;
11610 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011611 /*
11612 We make the hash of the empty string be 0, rather than using
11613 (prefix ^ suffix), since this slightly obfuscates the hash secret
11614 */
11615 if (len == 0) {
11616 _PyUnicode_HASH(self) = 0;
11617 return 0;
11618 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011619 x = _Py_HashBytes(PyUnicode_DATA(self),
11620 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011622 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623}
11624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011625PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011626 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627\n\
oldkaa0735f2018-02-02 16:52:55 +080011628Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011629such that sub is contained within S[start:end]. Optional\n\
11630arguments start and end are interpreted as in slice notation.\n\
11631\n\
11632Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633
11634static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011637 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011638 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011639 PyObject *substring = NULL;
11640 Py_ssize_t start = 0;
11641 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011643 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011646 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011648
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011649 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651 if (result == -2)
11652 return NULL;
11653
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654 if (result < 0) {
11655 PyErr_SetString(PyExc_ValueError, "substring not found");
11656 return NULL;
11657 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011658
Christian Heimes217cfd12007-12-02 14:31:20 +000011659 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660}
11661
INADA Naoki3ae20562017-01-16 20:41:20 +090011662/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011663str.isascii as unicode_isascii
11664
11665Return True if all characters in the string are ASCII, False otherwise.
11666
11667ASCII characters have code points in the range U+0000-U+007F.
11668Empty string is ASCII too.
11669[clinic start generated code]*/
11670
11671static PyObject *
11672unicode_isascii_impl(PyObject *self)
11673/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11674{
11675 if (PyUnicode_READY(self) == -1) {
11676 return NULL;
11677 }
11678 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11679}
11680
11681/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011682str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683
INADA Naoki3ae20562017-01-16 20:41:20 +090011684Return True if the string is a lowercase string, False otherwise.
11685
11686A string is lowercase if all cased characters in the string are lowercase and
11687there is at least one cased character in the string.
11688[clinic start generated code]*/
11689
11690static PyObject *
11691unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011692/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 Py_ssize_t i, length;
11695 int kind;
11696 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697 int cased;
11698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 if (PyUnicode_READY(self) == -1)
11700 return NULL;
11701 length = PyUnicode_GET_LENGTH(self);
11702 kind = PyUnicode_KIND(self);
11703 data = PyUnicode_DATA(self);
11704
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 if (length == 1)
11707 return PyBool_FromLong(
11708 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011710 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011712 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011713
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715 for (i = 0; i < length; i++) {
11716 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011717
Benjamin Peterson29060642009-01-31 22:14:21 +000011718 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011719 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011720 else if (!cased && Py_UNICODE_ISLOWER(ch))
11721 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011723 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724}
11725
INADA Naoki3ae20562017-01-16 20:41:20 +090011726/*[clinic input]
11727str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728
INADA Naoki3ae20562017-01-16 20:41:20 +090011729Return True if the string is an uppercase string, False otherwise.
11730
11731A string is uppercase if all cased characters in the string are uppercase and
11732there is at least one cased character in the string.
11733[clinic start generated code]*/
11734
11735static PyObject *
11736unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011737/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 Py_ssize_t i, length;
11740 int kind;
11741 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742 int cased;
11743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 if (PyUnicode_READY(self) == -1)
11745 return NULL;
11746 length = PyUnicode_GET_LENGTH(self);
11747 kind = PyUnicode_KIND(self);
11748 data = PyUnicode_DATA(self);
11749
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 if (length == 1)
11752 return PyBool_FromLong(
11753 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011755 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011757 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011758
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 for (i = 0; i < length; i++) {
11761 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011762
Benjamin Peterson29060642009-01-31 22:14:21 +000011763 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011764 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011765 else if (!cased && Py_UNICODE_ISUPPER(ch))
11766 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011768 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769}
11770
INADA Naoki3ae20562017-01-16 20:41:20 +090011771/*[clinic input]
11772str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773
INADA Naoki3ae20562017-01-16 20:41:20 +090011774Return True if the string is a title-cased string, False otherwise.
11775
11776In a title-cased string, upper- and title-case characters may only
11777follow uncased characters and lowercase characters only cased ones.
11778[clinic start generated code]*/
11779
11780static PyObject *
11781unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011782/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784 Py_ssize_t i, length;
11785 int kind;
11786 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787 int cased, previous_is_cased;
11788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 if (PyUnicode_READY(self) == -1)
11790 return NULL;
11791 length = PyUnicode_GET_LENGTH(self);
11792 kind = PyUnicode_KIND(self);
11793 data = PyUnicode_DATA(self);
11794
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 if (length == 1) {
11797 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11798 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11799 (Py_UNICODE_ISUPPER(ch) != 0));
11800 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011802 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011804 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011805
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806 cased = 0;
11807 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 for (i = 0; i < length; i++) {
11809 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011810
Benjamin Peterson29060642009-01-31 22:14:21 +000011811 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11812 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011813 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011814 previous_is_cased = 1;
11815 cased = 1;
11816 }
11817 else if (Py_UNICODE_ISLOWER(ch)) {
11818 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011819 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 previous_is_cased = 1;
11821 cased = 1;
11822 }
11823 else
11824 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011826 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827}
11828
INADA Naoki3ae20562017-01-16 20:41:20 +090011829/*[clinic input]
11830str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831
INADA Naoki3ae20562017-01-16 20:41:20 +090011832Return True if the string is a whitespace string, False otherwise.
11833
11834A string is whitespace if all characters in the string are whitespace and there
11835is at least one character in the string.
11836[clinic start generated code]*/
11837
11838static PyObject *
11839unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011840/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 Py_ssize_t i, length;
11843 int kind;
11844 void *data;
11845
11846 if (PyUnicode_READY(self) == -1)
11847 return NULL;
11848 length = PyUnicode_GET_LENGTH(self);
11849 kind = PyUnicode_KIND(self);
11850 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 if (length == 1)
11854 return PyBool_FromLong(
11855 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011856
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011857 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011858 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011859 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861 for (i = 0; i < length; i++) {
11862 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011863 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011864 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011866 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867}
11868
INADA Naoki3ae20562017-01-16 20:41:20 +090011869/*[clinic input]
11870str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011871
INADA Naoki3ae20562017-01-16 20:41:20 +090011872Return True if the string is an alphabetic string, False otherwise.
11873
11874A string is alphabetic if all characters in the string are alphabetic and there
11875is at least one character in the string.
11876[clinic start generated code]*/
11877
11878static PyObject *
11879unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011880/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011881{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 Py_ssize_t i, length;
11883 int kind;
11884 void *data;
11885
11886 if (PyUnicode_READY(self) == -1)
11887 return NULL;
11888 length = PyUnicode_GET_LENGTH(self);
11889 kind = PyUnicode_KIND(self);
11890 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011891
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011892 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 if (length == 1)
11894 return PyBool_FromLong(
11895 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011896
11897 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011899 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 for (i = 0; i < length; i++) {
11902 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011903 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011904 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011905 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011906}
11907
INADA Naoki3ae20562017-01-16 20:41:20 +090011908/*[clinic input]
11909str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011910
INADA Naoki3ae20562017-01-16 20:41:20 +090011911Return True if the string is an alpha-numeric string, False otherwise.
11912
11913A string is alpha-numeric if all characters in the string are alpha-numeric and
11914there is at least one character in the string.
11915[clinic start generated code]*/
11916
11917static PyObject *
11918unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011919/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011920{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 int kind;
11922 void *data;
11923 Py_ssize_t len, i;
11924
11925 if (PyUnicode_READY(self) == -1)
11926 return NULL;
11927
11928 kind = PyUnicode_KIND(self);
11929 data = PyUnicode_DATA(self);
11930 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011931
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011932 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 if (len == 1) {
11934 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11935 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11936 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011937
11938 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011940 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 for (i = 0; i < len; i++) {
11943 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011944 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011945 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011946 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011947 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011948}
11949
INADA Naoki3ae20562017-01-16 20:41:20 +090011950/*[clinic input]
11951str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952
INADA Naoki3ae20562017-01-16 20:41:20 +090011953Return True if the string is a decimal string, False otherwise.
11954
11955A string is a decimal string if all characters in the string are decimal and
11956there is at least one character in the string.
11957[clinic start generated code]*/
11958
11959static PyObject *
11960unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011961/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 Py_ssize_t i, length;
11964 int kind;
11965 void *data;
11966
11967 if (PyUnicode_READY(self) == -1)
11968 return NULL;
11969 length = PyUnicode_GET_LENGTH(self);
11970 kind = PyUnicode_KIND(self);
11971 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 if (length == 1)
11975 return PyBool_FromLong(
11976 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011978 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011980 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 for (i = 0; i < length; i++) {
11983 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011984 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011986 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987}
11988
INADA Naoki3ae20562017-01-16 20:41:20 +090011989/*[clinic input]
11990str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991
INADA Naoki3ae20562017-01-16 20:41:20 +090011992Return True if the string is a digit string, False otherwise.
11993
11994A string is a digit string if all characters in the string are digits and there
11995is at least one character in the string.
11996[clinic start generated code]*/
11997
11998static PyObject *
11999unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012000/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 Py_ssize_t i, length;
12003 int kind;
12004 void *data;
12005
12006 if (PyUnicode_READY(self) == -1)
12007 return NULL;
12008 length = PyUnicode_GET_LENGTH(self);
12009 kind = PyUnicode_KIND(self);
12010 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 if (length == 1) {
12014 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12015 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012017
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012018 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012020 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 for (i = 0; i < length; i++) {
12023 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012024 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012025 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012026 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027}
12028
INADA Naoki3ae20562017-01-16 20:41:20 +090012029/*[clinic input]
12030str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012031
INADA Naoki3ae20562017-01-16 20:41:20 +090012032Return True if the string is a numeric string, False otherwise.
12033
12034A string is numeric if all characters in the string are numeric and there is at
12035least one character in the string.
12036[clinic start generated code]*/
12037
12038static PyObject *
12039unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012040/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 Py_ssize_t i, length;
12043 int kind;
12044 void *data;
12045
12046 if (PyUnicode_READY(self) == -1)
12047 return NULL;
12048 length = PyUnicode_GET_LENGTH(self);
12049 kind = PyUnicode_KIND(self);
12050 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 if (length == 1)
12054 return PyBool_FromLong(
12055 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012057 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012059 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 for (i = 0; i < length; i++) {
12062 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012063 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012065 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066}
12067
Martin v. Löwis47383402007-08-15 07:32:56 +000012068int
12069PyUnicode_IsIdentifier(PyObject *self)
12070{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 int kind;
12072 void *data;
12073 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012074 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 if (PyUnicode_READY(self) == -1) {
12077 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012078 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 }
12080
12081 /* Special case for empty strings */
12082 if (PyUnicode_GET_LENGTH(self) == 0)
12083 return 0;
12084 kind = PyUnicode_KIND(self);
12085 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012086
12087 /* PEP 3131 says that the first character must be in
12088 XID_Start and subsequent characters in XID_Continue,
12089 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012090 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012091 letters, digits, underscore). However, given the current
12092 definition of XID_Start and XID_Continue, it is sufficient
12093 to check just for these, except that _ must be allowed
12094 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012095 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012096 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012097 return 0;
12098
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012099 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012101 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012102 return 1;
12103}
12104
INADA Naoki3ae20562017-01-16 20:41:20 +090012105/*[clinic input]
12106str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012107
INADA Naoki3ae20562017-01-16 20:41:20 +090012108Return True if the string is a valid Python identifier, False otherwise.
12109
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012110Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012111such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012112[clinic start generated code]*/
12113
12114static PyObject *
12115unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012116/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012117{
12118 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12119}
12120
INADA Naoki3ae20562017-01-16 20:41:20 +090012121/*[clinic input]
12122str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012123
INADA Naoki3ae20562017-01-16 20:41:20 +090012124Return True if the string is printable, False otherwise.
12125
12126A string is printable if all of its characters are considered printable in
12127repr() or if it is empty.
12128[clinic start generated code]*/
12129
12130static PyObject *
12131unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012132/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012133{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 Py_ssize_t i, length;
12135 int kind;
12136 void *data;
12137
12138 if (PyUnicode_READY(self) == -1)
12139 return NULL;
12140 length = PyUnicode_GET_LENGTH(self);
12141 kind = PyUnicode_KIND(self);
12142 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012143
12144 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 if (length == 1)
12146 return PyBool_FromLong(
12147 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 for (i = 0; i < length; i++) {
12150 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012151 Py_RETURN_FALSE;
12152 }
12153 }
12154 Py_RETURN_TRUE;
12155}
12156
INADA Naoki3ae20562017-01-16 20:41:20 +090012157/*[clinic input]
12158str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159
INADA Naoki3ae20562017-01-16 20:41:20 +090012160 iterable: object
12161 /
12162
12163Concatenate any number of strings.
12164
Martin Panter91a88662017-01-24 00:30:06 +000012165The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012166The result is returned as a new string.
12167
12168Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12169[clinic start generated code]*/
12170
12171static PyObject *
12172unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012173/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174{
INADA Naoki3ae20562017-01-16 20:41:20 +090012175 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176}
12177
Martin v. Löwis18e16552006-02-15 17:27:45 +000012178static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012179unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 if (PyUnicode_READY(self) == -1)
12182 return -1;
12183 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184}
12185
INADA Naoki3ae20562017-01-16 20:41:20 +090012186/*[clinic input]
12187str.ljust as unicode_ljust
12188
12189 width: Py_ssize_t
12190 fillchar: Py_UCS4 = ' '
12191 /
12192
12193Return a left-justified string of length width.
12194
12195Padding is done using the specified fill character (default is a space).
12196[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197
12198static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012199unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12200/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012202 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012203 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204
Victor Stinnerc4b49542011-12-11 22:44:26 +010012205 if (PyUnicode_GET_LENGTH(self) >= width)
12206 return unicode_result_unchanged(self);
12207
12208 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209}
12210
INADA Naoki3ae20562017-01-16 20:41:20 +090012211/*[clinic input]
12212str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213
INADA Naoki3ae20562017-01-16 20:41:20 +090012214Return a copy of the string converted to lowercase.
12215[clinic start generated code]*/
12216
12217static PyObject *
12218unicode_lower_impl(PyObject *self)
12219/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012221 if (PyUnicode_READY(self) == -1)
12222 return NULL;
12223 if (PyUnicode_IS_ASCII(self))
12224 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012225 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226}
12227
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method name for each strip variant, indexed by the LEFTSTRIP,
   RIGHTSTRIP and BOTHSTRIP constants above.  Both the strings and the
   table itself are read-only, so the array cannot be modified. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the strip method for strip type i (one of the constants above). */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012236
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012237/* externally visible for str.strip(unicode) */
12238PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012239_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012240{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012241 void *data;
12242 int kind;
12243 Py_ssize_t i, j, len;
12244 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012245 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12248 return NULL;
12249
12250 kind = PyUnicode_KIND(self);
12251 data = PyUnicode_DATA(self);
12252 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012253 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12255 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012256 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012257
Benjamin Peterson14339b62009-01-31 16:36:08 +000012258 i = 0;
12259 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012260 while (i < len) {
12261 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12262 if (!BLOOM(sepmask, ch))
12263 break;
12264 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12265 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012266 i++;
12267 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012268 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012269
Benjamin Peterson14339b62009-01-31 16:36:08 +000012270 j = len;
12271 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012272 j--;
12273 while (j >= i) {
12274 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12275 if (!BLOOM(sepmask, ch))
12276 break;
12277 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12278 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012280 }
12281
Benjamin Peterson29060642009-01-31 22:14:21 +000012282 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012283 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012284
Victor Stinner7931d9a2011-11-04 00:22:48 +010012285 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286}
12287
12288PyObject*
12289PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12290{
12291 unsigned char *data;
12292 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012293 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294
Victor Stinnerde636f32011-10-01 03:55:54 +020012295 if (PyUnicode_READY(self) == -1)
12296 return NULL;
12297
Victor Stinner684d5fd2012-05-03 02:32:34 +020012298 length = PyUnicode_GET_LENGTH(self);
12299 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012300
Victor Stinner684d5fd2012-05-03 02:32:34 +020012301 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012302 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303
Victor Stinnerde636f32011-10-01 03:55:54 +020012304 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012305 PyErr_SetString(PyExc_IndexError, "string index out of range");
12306 return NULL;
12307 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012308 if (start >= length || end < start)
12309 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012310
Victor Stinner684d5fd2012-05-03 02:32:34 +020012311 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012312 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012313 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012314 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012315 }
12316 else {
12317 kind = PyUnicode_KIND(self);
12318 data = PyUnicode_1BYTE_DATA(self);
12319 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012320 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012321 length);
12322 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012324
12325static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012326do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012327{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328 Py_ssize_t len, i, j;
12329
12330 if (PyUnicode_READY(self) == -1)
12331 return NULL;
12332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012334
Victor Stinnercc7af722013-04-09 22:39:24 +020012335 if (PyUnicode_IS_ASCII(self)) {
12336 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12337
12338 i = 0;
12339 if (striptype != RIGHTSTRIP) {
12340 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012341 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012342 if (!_Py_ascii_whitespace[ch])
12343 break;
12344 i++;
12345 }
12346 }
12347
12348 j = len;
12349 if (striptype != LEFTSTRIP) {
12350 j--;
12351 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012352 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012353 if (!_Py_ascii_whitespace[ch])
12354 break;
12355 j--;
12356 }
12357 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012358 }
12359 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012360 else {
12361 int kind = PyUnicode_KIND(self);
12362 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012363
Victor Stinnercc7af722013-04-09 22:39:24 +020012364 i = 0;
12365 if (striptype != RIGHTSTRIP) {
12366 while (i < len) {
12367 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12368 if (!Py_UNICODE_ISSPACE(ch))
12369 break;
12370 i++;
12371 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012372 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012373
12374 j = len;
12375 if (striptype != LEFTSTRIP) {
12376 j--;
12377 while (j >= i) {
12378 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12379 if (!Py_UNICODE_ISSPACE(ch))
12380 break;
12381 j--;
12382 }
12383 j++;
12384 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012385 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012386
Victor Stinner7931d9a2011-11-04 00:22:48 +010012387 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388}
12389
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012390
12391static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012392do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012393{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012394 if (sep != NULL && sep != Py_None) {
12395 if (PyUnicode_Check(sep))
12396 return _PyUnicode_XStrip(self, striptype, sep);
12397 else {
12398 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012399 "%s arg must be None or str",
12400 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012401 return NULL;
12402 }
12403 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012404
Benjamin Peterson14339b62009-01-31 16:36:08 +000012405 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012406}
12407
12408
INADA Naoki3ae20562017-01-16 20:41:20 +090012409/*[clinic input]
12410str.strip as unicode_strip
12411
12412 chars: object = None
12413 /
12414
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012416
12417If chars is given and not None, remove characters in chars instead.
12418[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012419
12420static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012421unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012422/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012423{
INADA Naoki3ae20562017-01-16 20:41:20 +090012424 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012425}
12426
12427
INADA Naoki3ae20562017-01-16 20:41:20 +090012428/*[clinic input]
12429str.lstrip as unicode_lstrip
12430
12431 chars: object = NULL
12432 /
12433
12434Return a copy of the string with leading whitespace removed.
12435
12436If chars is given and not None, remove characters in chars instead.
12437[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012438
12439static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012440unicode_lstrip_impl(PyObject *self, PyObject *chars)
12441/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012442{
INADA Naoki3ae20562017-01-16 20:41:20 +090012443 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012444}
12445
12446
INADA Naoki3ae20562017-01-16 20:41:20 +090012447/*[clinic input]
12448str.rstrip as unicode_rstrip
12449
12450 chars: object = NULL
12451 /
12452
12453Return a copy of the string with trailing whitespace removed.
12454
12455If chars is given and not None, remove characters in chars instead.
12456[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012457
12458static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012459unicode_rstrip_impl(PyObject *self, PyObject *chars)
12460/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012461{
INADA Naoki3ae20562017-01-16 20:41:20 +090012462 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012463}
12464
12465
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012467unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012468{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012469 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012470 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012471
Serhiy Storchaka05997252013-01-26 12:14:02 +020012472 if (len < 1)
12473 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474
Victor Stinnerc4b49542011-12-11 22:44:26 +010012475 /* no repeat, return original string */
12476 if (len == 1)
12477 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012478
Benjamin Petersonbac79492012-01-14 13:34:47 -050012479 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 return NULL;
12481
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012482 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012483 PyErr_SetString(PyExc_OverflowError,
12484 "repeated string is too long");
12485 return NULL;
12486 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012488
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012489 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490 if (!u)
12491 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012492 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 if (PyUnicode_GET_LENGTH(str) == 1) {
12495 const int kind = PyUnicode_KIND(str);
12496 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012497 if (kind == PyUnicode_1BYTE_KIND) {
12498 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012499 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012500 }
12501 else if (kind == PyUnicode_2BYTE_KIND) {
12502 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012503 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012504 ucs2[n] = fill_char;
12505 } else {
12506 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12507 assert(kind == PyUnicode_4BYTE_KIND);
12508 for (n = 0; n < len; ++n)
12509 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 }
12512 else {
12513 /* number of characters copied this far */
12514 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012515 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012516 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012517 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012519 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012521 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012522 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012523 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524 }
12525
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012526 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012527 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528}
12529
Alexander Belopolsky40018472011-02-26 01:02:56 +000012530PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012531PyUnicode_Replace(PyObject *str,
12532 PyObject *substr,
12533 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012534 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012536 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12537 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012538 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012539 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540}
12541
INADA Naoki3ae20562017-01-16 20:41:20 +090012542/*[clinic input]
12543str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544
INADA Naoki3ae20562017-01-16 20:41:20 +090012545 old: unicode
12546 new: unicode
12547 count: Py_ssize_t = -1
12548 Maximum number of occurrences to replace.
12549 -1 (the default value) means replace all occurrences.
12550 /
12551
12552Return a copy with all occurrences of substring old replaced by new.
12553
12554If the optional argument count is given, only the first count occurrences are
12555replaced.
12556[clinic start generated code]*/
12557
12558static PyObject *
12559unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12560 Py_ssize_t count)
12561/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012563 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012564 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012565 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566}
12567
Alexander Belopolsky40018472011-02-26 01:02:56 +000012568static PyObject *
12569unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012571 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012572 Py_ssize_t isize;
12573 Py_ssize_t osize, squote, dquote, i, o;
12574 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012575 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012579 return NULL;
12580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 isize = PyUnicode_GET_LENGTH(unicode);
12582 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 /* Compute length of output, quote characters, and
12585 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012586 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 max = 127;
12588 squote = dquote = 0;
12589 ikind = PyUnicode_KIND(unicode);
12590 for (i = 0; i < isize; i++) {
12591 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012592 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012593 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012594 case '\'': squote++; break;
12595 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012596 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012597 incr = 2;
12598 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012599 default:
12600 /* Fast-path ASCII */
12601 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012602 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012603 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012604 ;
12605 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012607 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012608 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012609 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012610 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012612 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012614 if (osize > PY_SSIZE_T_MAX - incr) {
12615 PyErr_SetString(PyExc_OverflowError,
12616 "string is too long to generate repr");
12617 return NULL;
12618 }
12619 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012620 }
12621
12622 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012623 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012624 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012625 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012626 if (dquote)
12627 /* Both squote and dquote present. Use squote,
12628 and escape them */
12629 osize += squote;
12630 else
12631 quote = '"';
12632 }
Victor Stinner55c08782013-04-14 18:45:39 +020012633 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634
12635 repr = PyUnicode_New(osize, max);
12636 if (repr == NULL)
12637 return NULL;
12638 okind = PyUnicode_KIND(repr);
12639 odata = PyUnicode_DATA(repr);
12640
12641 PyUnicode_WRITE(okind, odata, 0, quote);
12642 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012643 if (unchanged) {
12644 _PyUnicode_FastCopyCharacters(repr, 1,
12645 unicode, 0,
12646 isize);
12647 }
12648 else {
12649 for (i = 0, o = 1; i < isize; i++) {
12650 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651
Victor Stinner55c08782013-04-14 18:45:39 +020012652 /* Escape quotes and backslashes */
12653 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012654 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012656 continue;
12657 }
12658
12659 /* Map special whitespace to '\t', \n', '\r' */
12660 if (ch == '\t') {
12661 PyUnicode_WRITE(okind, odata, o++, '\\');
12662 PyUnicode_WRITE(okind, odata, o++, 't');
12663 }
12664 else if (ch == '\n') {
12665 PyUnicode_WRITE(okind, odata, o++, '\\');
12666 PyUnicode_WRITE(okind, odata, o++, 'n');
12667 }
12668 else if (ch == '\r') {
12669 PyUnicode_WRITE(okind, odata, o++, '\\');
12670 PyUnicode_WRITE(okind, odata, o++, 'r');
12671 }
12672
12673 /* Map non-printable US ASCII to '\xhh' */
12674 else if (ch < ' ' || ch == 0x7F) {
12675 PyUnicode_WRITE(okind, odata, o++, '\\');
12676 PyUnicode_WRITE(okind, odata, o++, 'x');
12677 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12678 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12679 }
12680
12681 /* Copy ASCII characters as-is */
12682 else if (ch < 0x7F) {
12683 PyUnicode_WRITE(okind, odata, o++, ch);
12684 }
12685
12686 /* Non-ASCII characters */
12687 else {
12688 /* Map Unicode whitespace and control characters
12689 (categories Z* and C* except ASCII space)
12690 */
12691 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12692 PyUnicode_WRITE(okind, odata, o++, '\\');
12693 /* Map 8-bit characters to '\xhh' */
12694 if (ch <= 0xff) {
12695 PyUnicode_WRITE(okind, odata, o++, 'x');
12696 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12697 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12698 }
12699 /* Map 16-bit characters to '\uxxxx' */
12700 else if (ch <= 0xffff) {
12701 PyUnicode_WRITE(okind, odata, o++, 'u');
12702 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12703 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12704 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12705 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12706 }
12707 /* Map 21-bit characters to '\U00xxxxxx' */
12708 else {
12709 PyUnicode_WRITE(okind, odata, o++, 'U');
12710 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12711 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12712 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12713 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12714 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12715 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12716 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12717 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12718 }
12719 }
12720 /* Copy characters as-is */
12721 else {
12722 PyUnicode_WRITE(okind, odata, o++, ch);
12723 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012724 }
12725 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012726 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012727 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012728 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012729 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730}
12731
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012732PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012733 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734\n\
12735Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012736such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737arguments start and end are interpreted as in slice notation.\n\
12738\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012739Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740
12741static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012742unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012744 /* initialize variables to prevent gcc warning */
12745 PyObject *substring = NULL;
12746 Py_ssize_t start = 0;
12747 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012748 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012749
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012750 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012751 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012752
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012753 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012756 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012758 if (result == -2)
12759 return NULL;
12760
Christian Heimes217cfd12007-12-02 14:31:20 +000012761 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762}
12763
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012764PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012765 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012766\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012767Return the highest index in S where substring sub is found,\n\
12768such that sub is contained within S[start:end]. Optional\n\
12769arguments start and end are interpreted as in slice notation.\n\
12770\n\
12771Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012772
12773static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012774unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012775{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012776 /* initialize variables to prevent gcc warning */
12777 PyObject *substring = NULL;
12778 Py_ssize_t start = 0;
12779 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012780 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012781
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012782 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012783 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012785 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012786 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012787
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012788 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012790 if (result == -2)
12791 return NULL;
12792
Guido van Rossumd57fd912000-03-10 22:53:23 +000012793 if (result < 0) {
12794 PyErr_SetString(PyExc_ValueError, "substring not found");
12795 return NULL;
12796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012797
Christian Heimes217cfd12007-12-02 14:31:20 +000012798 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799}
12800
INADA Naoki3ae20562017-01-16 20:41:20 +090012801/*[clinic input]
12802str.rjust as unicode_rjust
12803
12804 width: Py_ssize_t
12805 fillchar: Py_UCS4 = ' '
12806 /
12807
12808Return a right-justified string of length width.
12809
12810Padding is done using the specified fill character (default is a space).
12811[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812
12813static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012814unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12815/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012817 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818 return NULL;
12819
Victor Stinnerc4b49542011-12-11 22:44:26 +010012820 if (PyUnicode_GET_LENGTH(self) >= width)
12821 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012822
Victor Stinnerc4b49542011-12-11 22:44:26 +010012823 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012824}
12825
Alexander Belopolsky40018472011-02-26 01:02:56 +000012826PyObject *
12827PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012829 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012830 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012832 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012833}
12834
INADA Naoki3ae20562017-01-16 20:41:20 +090012835/*[clinic input]
12836str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837
INADA Naoki3ae20562017-01-16 20:41:20 +090012838 sep: object = None
        The delimiter according to which to split the string.
12840 None (the default value) means split according to any whitespace,
12841 and discard empty strings from the result.
12842 maxsplit: Py_ssize_t = -1
12843 Maximum number of splits to do.
12844 -1 (the default value) means no limit.
12845
12846Return a list of the words in the string, using sep as the delimiter string.
12847[clinic start generated code]*/
12848
12849static PyObject *
12850unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12851/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012852{
INADA Naoki3ae20562017-01-16 20:41:20 +090012853 if (sep == Py_None)
12854 return split(self, NULL, maxsplit);
12855 if (PyUnicode_Check(sep))
12856 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012857
Victor Stinner998b8062018-09-12 00:23:25 +020012858 PyErr_Format(PyExc_TypeError,
12859 "must be str or None, not %.100s",
12860 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012861 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012862}
12863
Thomas Wouters477c8d52006-05-27 19:21:47 +000012864PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012865PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012866{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012867 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012868 int kind1, kind2;
12869 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012870 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012871
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012872 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012873 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012874
Victor Stinner14f8f022011-10-05 20:58:25 +020012875 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012876 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012877 len1 = PyUnicode_GET_LENGTH(str_obj);
12878 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012879 if (kind1 < kind2 || len1 < len2) {
12880 _Py_INCREF_UNICODE_EMPTY();
12881 if (!unicode_empty)
12882 out = NULL;
12883 else {
12884 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12885 Py_DECREF(unicode_empty);
12886 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012887 return out;
12888 }
12889 buf1 = PyUnicode_DATA(str_obj);
12890 buf2 = PyUnicode_DATA(sep_obj);
12891 if (kind2 != kind1) {
12892 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12893 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012894 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012896
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012897 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012898 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012899 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12900 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12901 else
12902 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012903 break;
12904 case PyUnicode_2BYTE_KIND:
12905 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12906 break;
12907 case PyUnicode_4BYTE_KIND:
12908 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12909 break;
12910 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012911 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012912 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012913
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012914 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012915 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012916
12917 return out;
12918}
12919
12920
12921PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012922PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012923{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012924 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012925 int kind1, kind2;
12926 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012927 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012928
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012929 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012930 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012931
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012932 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012934 len1 = PyUnicode_GET_LENGTH(str_obj);
12935 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012936 if (kind1 < kind2 || len1 < len2) {
12937 _Py_INCREF_UNICODE_EMPTY();
12938 if (!unicode_empty)
12939 out = NULL;
12940 else {
12941 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12942 Py_DECREF(unicode_empty);
12943 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012944 return out;
12945 }
12946 buf1 = PyUnicode_DATA(str_obj);
12947 buf2 = PyUnicode_DATA(sep_obj);
12948 if (kind2 != kind1) {
12949 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12950 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012951 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012952 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012953
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012954 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012955 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012956 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12957 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12958 else
12959 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012960 break;
12961 case PyUnicode_2BYTE_KIND:
12962 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12963 break;
12964 case PyUnicode_4BYTE_KIND:
12965 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12966 break;
12967 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012968 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012969 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012970
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012971 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012973
12974 return out;
12975}
12976
INADA Naoki3ae20562017-01-16 20:41:20 +090012977/*[clinic input]
12978str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012979
INADA Naoki3ae20562017-01-16 20:41:20 +090012980 sep: object
12981 /
12982
12983Partition the string into three parts using the given separator.
12984
12985This will search for the separator in the string. If the separator is found,
12986returns a 3-tuple containing the part before the separator, the separator
12987itself, and the part after it.
12988
12989If the separator is not found, returns a 3-tuple containing the original string
12990and two empty strings.
12991[clinic start generated code]*/
12992
12993static PyObject *
12994unicode_partition(PyObject *self, PyObject *sep)
12995/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000012996{
INADA Naoki3ae20562017-01-16 20:41:20 +090012997 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012998}
12999
INADA Naoki3ae20562017-01-16 20:41:20 +090013000/*[clinic input]
13001str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013002
INADA Naoki3ae20562017-01-16 20:41:20 +090013003Partition the string into three parts using the given separator.
13004
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013005This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013006the separator is found, returns a 3-tuple containing the part before the
13007separator, the separator itself, and the part after it.
13008
13009If the separator is not found, returns a 3-tuple containing two empty strings
13010and the original string.
13011[clinic start generated code]*/
13012
13013static PyObject *
13014unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013015/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013016{
INADA Naoki3ae20562017-01-16 20:41:20 +090013017 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013018}
13019
Alexander Belopolsky40018472011-02-26 01:02:56 +000013020PyObject *
13021PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013022{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013023 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013024 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013025
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013026 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013027}
13028
INADA Naoki3ae20562017-01-16 20:41:20 +090013029/*[clinic input]
13030str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013031
INADA Naoki3ae20562017-01-16 20:41:20 +090013032Return a list of the words in the string, using sep as the delimiter string.
13033
13034Splits are done starting at the end of the string and working to the front.
13035[clinic start generated code]*/
13036
13037static PyObject *
13038unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13039/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013040{
INADA Naoki3ae20562017-01-16 20:41:20 +090013041 if (sep == Py_None)
13042 return rsplit(self, NULL, maxsplit);
13043 if (PyUnicode_Check(sep))
13044 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013045
Victor Stinner998b8062018-09-12 00:23:25 +020013046 PyErr_Format(PyExc_TypeError,
13047 "must be str or None, not %.100s",
13048 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013049 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013050}
13051
INADA Naoki3ae20562017-01-16 20:41:20 +090013052/*[clinic input]
13053str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013054
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013055 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013056
13057Return a list of the lines in the string, breaking at line boundaries.
13058
13059Line breaks are not included in the resulting list unless keepends is given and
13060true.
13061[clinic start generated code]*/
13062
13063static PyObject *
13064unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013065/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013067 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013068}
13069
13070static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013071PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013072{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013073 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013074}
13075
INADA Naoki3ae20562017-01-16 20:41:20 +090013076/*[clinic input]
13077str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013078
INADA Naoki3ae20562017-01-16 20:41:20 +090013079Convert uppercase characters to lowercase and lowercase characters to uppercase.
13080[clinic start generated code]*/
13081
13082static PyObject *
13083unicode_swapcase_impl(PyObject *self)
13084/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013086 if (PyUnicode_READY(self) == -1)
13087 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013088 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089}
13090
Larry Hastings61272b72014-01-07 12:41:53 -080013091/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013092
Larry Hastings31826802013-10-19 00:09:25 -070013093@staticmethod
13094str.maketrans as unicode_maketrans
13095
13096 x: object
13097
13098 y: unicode=NULL
13099
13100 z: unicode=NULL
13101
13102 /
13103
13104Return a translation table usable for str.translate().
13105
13106If there is only one argument, it must be a dictionary mapping Unicode
13107ordinals (integers) or characters to Unicode ordinals, strings or None.
13108Character keys will be then converted to ordinals.
13109If there are two arguments, they must be strings of equal length, and
13110in the resulting dictionary, each character in x will be mapped to the
13111character at the same position in y. If there is a third argument, it
13112must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013113[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013114
Larry Hastings31826802013-10-19 00:09:25 -070013115static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013116unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013117/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013118{
Georg Brandlceee0772007-11-27 23:48:05 +000013119 PyObject *new = NULL, *key, *value;
13120 Py_ssize_t i = 0;
13121 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013122
Georg Brandlceee0772007-11-27 23:48:05 +000013123 new = PyDict_New();
13124 if (!new)
13125 return NULL;
13126 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013127 int x_kind, y_kind, z_kind;
13128 void *x_data, *y_data, *z_data;
13129
Georg Brandlceee0772007-11-27 23:48:05 +000013130 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013131 if (!PyUnicode_Check(x)) {
13132 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13133 "be a string if there is a second argument");
13134 goto err;
13135 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013136 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013137 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13138 "arguments must have equal length");
13139 goto err;
13140 }
13141 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013142 x_kind = PyUnicode_KIND(x);
13143 y_kind = PyUnicode_KIND(y);
13144 x_data = PyUnicode_DATA(x);
13145 y_data = PyUnicode_DATA(y);
13146 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13147 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013148 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013149 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013150 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013151 if (!value) {
13152 Py_DECREF(key);
13153 goto err;
13154 }
Georg Brandlceee0772007-11-27 23:48:05 +000013155 res = PyDict_SetItem(new, key, value);
13156 Py_DECREF(key);
13157 Py_DECREF(value);
13158 if (res < 0)
13159 goto err;
13160 }
13161 /* create entries for deleting chars in z */
13162 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013163 z_kind = PyUnicode_KIND(z);
13164 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013165 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013166 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013167 if (!key)
13168 goto err;
13169 res = PyDict_SetItem(new, key, Py_None);
13170 Py_DECREF(key);
13171 if (res < 0)
13172 goto err;
13173 }
13174 }
13175 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013176 int kind;
13177 void *data;
13178
Georg Brandlceee0772007-11-27 23:48:05 +000013179 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013180 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013181 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13182 "to maketrans it must be a dict");
13183 goto err;
13184 }
13185 /* copy entries into the new dict, converting string keys to int keys */
13186 while (PyDict_Next(x, &i, &key, &value)) {
13187 if (PyUnicode_Check(key)) {
13188 /* convert string keys to integer keys */
13189 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013190 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013191 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13192 "table must be of length 1");
13193 goto err;
13194 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013195 kind = PyUnicode_KIND(key);
13196 data = PyUnicode_DATA(key);
13197 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013198 if (!newkey)
13199 goto err;
13200 res = PyDict_SetItem(new, newkey, value);
13201 Py_DECREF(newkey);
13202 if (res < 0)
13203 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013204 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013205 /* just keep integer keys */
13206 if (PyDict_SetItem(new, key, value) < 0)
13207 goto err;
13208 } else {
13209 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13210 "be strings or integers");
13211 goto err;
13212 }
13213 }
13214 }
13215 return new;
13216 err:
13217 Py_DECREF(new);
13218 return NULL;
13219}
13220
INADA Naoki3ae20562017-01-16 20:41:20 +090013221/*[clinic input]
13222str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013223
INADA Naoki3ae20562017-01-16 20:41:20 +090013224 table: object
13225 Translation table, which must be a mapping of Unicode ordinals to
13226 Unicode ordinals, strings, or None.
13227 /
13228
13229Replace each character in the string using the given translation table.
13230
13231The table must implement lookup/indexing via __getitem__, for instance a
13232dictionary or list. If this operation raises LookupError, the character is
13233left untouched. Characters mapped to None are deleted.
13234[clinic start generated code]*/
13235
13236static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013237unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013238/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013239{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013240 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013241}
13242
INADA Naoki3ae20562017-01-16 20:41:20 +090013243/*[clinic input]
13244str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013245
INADA Naoki3ae20562017-01-16 20:41:20 +090013246Return a copy of the string converted to uppercase.
13247[clinic start generated code]*/
13248
13249static PyObject *
13250unicode_upper_impl(PyObject *self)
13251/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013252{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013253 if (PyUnicode_READY(self) == -1)
13254 return NULL;
13255 if (PyUnicode_IS_ASCII(self))
13256 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013257 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013258}
13259
INADA Naoki3ae20562017-01-16 20:41:20 +090013260/*[clinic input]
13261str.zfill as unicode_zfill
13262
13263 width: Py_ssize_t
13264 /
13265
13266Pad a numeric string with zeros on the left, to fill a field of the given width.
13267
13268The string is never truncated.
13269[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013270
13271static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013272unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013273/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013275 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013276 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013277 int kind;
13278 void *data;
13279 Py_UCS4 chr;
13280
Benjamin Petersonbac79492012-01-14 13:34:47 -050013281 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013282 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013283
Victor Stinnerc4b49542011-12-11 22:44:26 +010013284 if (PyUnicode_GET_LENGTH(self) >= width)
13285 return unicode_result_unchanged(self);
13286
13287 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013288
13289 u = pad(self, fill, 0, '0');
13290
Walter Dörwald068325e2002-04-15 13:36:47 +000013291 if (u == NULL)
13292 return NULL;
13293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013294 kind = PyUnicode_KIND(u);
13295 data = PyUnicode_DATA(u);
13296 chr = PyUnicode_READ(kind, data, fill);
13297
13298 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013299 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013300 PyUnicode_WRITE(kind, data, 0, chr);
13301 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013302 }
13303
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013304 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013305 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013306}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013307
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13315
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013316PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013317 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013318\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013319Return True if S starts with the specified prefix, False otherwise.\n\
13320With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013321With optional end, stop comparing S at that position.\n\
13322prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013323
13324static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013325unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013326 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013327{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013328 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013329 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013330 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013331 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013332 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013333
Jesus Ceaac451502011-04-20 17:09:23 +020013334 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013335 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013336 if (PyTuple_Check(subobj)) {
13337 Py_ssize_t i;
13338 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013339 substring = PyTuple_GET_ITEM(subobj, i);
13340 if (!PyUnicode_Check(substring)) {
13341 PyErr_Format(PyExc_TypeError,
13342 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013343 "not %.100s",
13344 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013345 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013346 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013347 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013348 if (result == -1)
13349 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013350 if (result) {
13351 Py_RETURN_TRUE;
13352 }
13353 }
13354 /* nothing matched */
13355 Py_RETURN_FALSE;
13356 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013357 if (!PyUnicode_Check(subobj)) {
13358 PyErr_Format(PyExc_TypeError,
13359 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013360 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013361 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013362 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013363 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013364 if (result == -1)
13365 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013366 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013367}
13368
13369
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013370PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013371 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013373Return True if S ends with the specified suffix, False otherwise.\n\
13374With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013375With optional end, stop comparing S at that position.\n\
13376suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013377
13378static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013379unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013380 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013381{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013382 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013383 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013384 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013385 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013386 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013387
Jesus Ceaac451502011-04-20 17:09:23 +020013388 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013389 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013390 if (PyTuple_Check(subobj)) {
13391 Py_ssize_t i;
13392 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013393 substring = PyTuple_GET_ITEM(subobj, i);
13394 if (!PyUnicode_Check(substring)) {
13395 PyErr_Format(PyExc_TypeError,
13396 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013397 "not %.100s",
13398 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013400 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013401 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013402 if (result == -1)
13403 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013404 if (result) {
13405 Py_RETURN_TRUE;
13406 }
13407 }
13408 Py_RETURN_FALSE;
13409 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013410 if (!PyUnicode_Check(subobj)) {
13411 PyErr_Format(PyExc_TypeError,
13412 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013413 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013414 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013415 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013416 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013417 if (result == -1)
13418 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013419 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013420}
13421
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013422static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013423_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013424{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013425 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13426 writer->data = PyUnicode_DATA(writer->buffer);
13427
13428 if (!writer->readonly) {
13429 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013430 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013431 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013432 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013433 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13434 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13435 writer->kind = PyUnicode_WCHAR_KIND;
13436 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13437
Victor Stinner8f674cc2013-04-17 23:02:17 +020013438 /* Copy-on-write mode: set buffer size to 0 so
13439 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13440 * next write. */
13441 writer->size = 0;
13442 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013443}
13444
Victor Stinnerd3f08822012-05-29 12:57:52 +020013445void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013446_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013447{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013448 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013449
13450 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013451 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013452
13453 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13454 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13455 writer->kind = PyUnicode_WCHAR_KIND;
13456 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013457}
13458
Victor Stinnerd3f08822012-05-29 12:57:52 +020013459int
13460_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13461 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013462{
13463 Py_ssize_t newlen;
13464 PyObject *newbuffer;
13465
Victor Stinner2740e462016-09-06 16:58:36 -070013466 assert(maxchar <= MAX_UNICODE);
13467
Victor Stinnerca9381e2015-09-22 00:58:32 +020013468 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013469 assert((maxchar > writer->maxchar && length >= 0)
13470 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013471
Victor Stinner202fdca2012-05-07 12:47:02 +020013472 if (length > PY_SSIZE_T_MAX - writer->pos) {
13473 PyErr_NoMemory();
13474 return -1;
13475 }
13476 newlen = writer->pos + length;
13477
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013478 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013479
Victor Stinnerd3f08822012-05-29 12:57:52 +020013480 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013481 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013482 if (writer->overallocate
13483 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13484 /* overallocate to limit the number of realloc() */
13485 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013486 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013487 if (newlen < writer->min_length)
13488 newlen = writer->min_length;
13489
Victor Stinnerd3f08822012-05-29 12:57:52 +020013490 writer->buffer = PyUnicode_New(newlen, maxchar);
13491 if (writer->buffer == NULL)
13492 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013493 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013494 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013495 if (writer->overallocate
13496 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13497 /* overallocate to limit the number of realloc() */
13498 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013499 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013500 if (newlen < writer->min_length)
13501 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013502
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013503 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013504 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013505 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013506 newbuffer = PyUnicode_New(newlen, maxchar);
13507 if (newbuffer == NULL)
13508 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013509 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13510 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013511 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013512 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013513 }
13514 else {
13515 newbuffer = resize_compact(writer->buffer, newlen);
13516 if (newbuffer == NULL)
13517 return -1;
13518 }
13519 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013520 }
13521 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013522 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013523 newbuffer = PyUnicode_New(writer->size, maxchar);
13524 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013525 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013526 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13527 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013528 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013529 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013530 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013531 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013532
13533#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013534}
13535
Victor Stinnerca9381e2015-09-22 00:58:32 +020013536int
13537_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13538 enum PyUnicode_Kind kind)
13539{
13540 Py_UCS4 maxchar;
13541
13542 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13543 assert(writer->kind < kind);
13544
13545 switch (kind)
13546 {
13547 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13548 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13549 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13550 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013551 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013552 }
13553
13554 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13555}
13556
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013557static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013558_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013559{
Victor Stinner2740e462016-09-06 16:58:36 -070013560 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013561 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13562 return -1;
13563 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13564 writer->pos++;
13565 return 0;
13566}
13567
13568int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013569_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13570{
13571 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13572}
13573
13574int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013575_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13576{
13577 Py_UCS4 maxchar;
13578 Py_ssize_t len;
13579
13580 if (PyUnicode_READY(str) == -1)
13581 return -1;
13582 len = PyUnicode_GET_LENGTH(str);
13583 if (len == 0)
13584 return 0;
13585 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13586 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013587 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013588 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013589 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013590 Py_INCREF(str);
13591 writer->buffer = str;
13592 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013593 writer->pos += len;
13594 return 0;
13595 }
13596 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13597 return -1;
13598 }
13599 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13600 str, 0, len);
13601 writer->pos += len;
13602 return 0;
13603}
13604
Victor Stinnere215d962012-10-06 23:03:36 +020013605int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013606_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13607 Py_ssize_t start, Py_ssize_t end)
13608{
13609 Py_UCS4 maxchar;
13610 Py_ssize_t len;
13611
13612 if (PyUnicode_READY(str) == -1)
13613 return -1;
13614
13615 assert(0 <= start);
13616 assert(end <= PyUnicode_GET_LENGTH(str));
13617 assert(start <= end);
13618
13619 if (end == 0)
13620 return 0;
13621
13622 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13623 return _PyUnicodeWriter_WriteStr(writer, str);
13624
13625 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13626 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13627 else
13628 maxchar = writer->maxchar;
13629 len = end - start;
13630
13631 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13632 return -1;
13633
13634 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13635 str, start, len);
13636 writer->pos += len;
13637 return 0;
13638}
13639
13640int
Victor Stinner4a587072013-11-19 12:54:53 +010013641_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13642 const char *ascii, Py_ssize_t len)
13643{
13644 if (len == -1)
13645 len = strlen(ascii);
13646
13647 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13648
13649 if (writer->buffer == NULL && !writer->overallocate) {
13650 PyObject *str;
13651
13652 str = _PyUnicode_FromASCII(ascii, len);
13653 if (str == NULL)
13654 return -1;
13655
13656 writer->readonly = 1;
13657 writer->buffer = str;
13658 _PyUnicodeWriter_Update(writer);
13659 writer->pos += len;
13660 return 0;
13661 }
13662
13663 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13664 return -1;
13665
13666 switch (writer->kind)
13667 {
13668 case PyUnicode_1BYTE_KIND:
13669 {
13670 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13671 Py_UCS1 *data = writer->data;
13672
Christian Heimesf051e432016-09-13 20:22:02 +020013673 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013674 break;
13675 }
13676 case PyUnicode_2BYTE_KIND:
13677 {
13678 _PyUnicode_CONVERT_BYTES(
13679 Py_UCS1, Py_UCS2,
13680 ascii, ascii + len,
13681 (Py_UCS2 *)writer->data + writer->pos);
13682 break;
13683 }
13684 case PyUnicode_4BYTE_KIND:
13685 {
13686 _PyUnicode_CONVERT_BYTES(
13687 Py_UCS1, Py_UCS4,
13688 ascii, ascii + len,
13689 (Py_UCS4 *)writer->data + writer->pos);
13690 break;
13691 }
13692 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013693 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013694 }
13695
13696 writer->pos += len;
13697 return 0;
13698}
13699
13700int
13701_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13702 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013703{
13704 Py_UCS4 maxchar;
13705
13706 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13707 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13708 return -1;
13709 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13710 writer->pos += len;
13711 return 0;
13712}
13713
Victor Stinnerd3f08822012-05-29 12:57:52 +020013714PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013715_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013716{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013717 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013718
Victor Stinnerd3f08822012-05-29 12:57:52 +020013719 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013720 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013721 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013722 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013723
13724 str = writer->buffer;
13725 writer->buffer = NULL;
13726
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013727 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013728 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13729 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013730 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013731
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013732 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13733 PyObject *str2;
13734 str2 = resize_compact(str, writer->pos);
13735 if (str2 == NULL) {
13736 Py_DECREF(str);
13737 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013738 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013739 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013740 }
13741
Victor Stinner15a0bd32013-07-08 22:29:55 +020013742 assert(_PyUnicode_CheckConsistency(str, 1));
13743 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013744}
13745
Victor Stinnerd3f08822012-05-29 12:57:52 +020013746void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013747_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013748{
13749 Py_CLEAR(writer->buffer);
13750}
13751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013752#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013753
/* Docstring attached to the "format" entry of unicode_methods.  The
   continuation lines must start in column 0: anything placed before them
   would become part of the string. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013759
/* Docstring attached to the "format_map" entry of unicode_methods.  The
   continuation lines must start in column 0: anything placed before them
   would become part of the string. */
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013765
INADA Naoki3ae20562017-01-16 20:41:20 +090013766/*[clinic input]
13767str.__format__ as unicode___format__
13768
13769 format_spec: unicode
13770 /
13771
13772Return a formatted version of the string as described by format_spec.
13773[clinic start generated code]*/
13774
Eric Smith4a7d76d2008-05-30 18:10:19 +000013775static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013776unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013777/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013778{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013779 _PyUnicodeWriter writer;
13780 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013781
Victor Stinnerd3f08822012-05-29 12:57:52 +020013782 if (PyUnicode_READY(self) == -1)
13783 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013784 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013785 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13786 self, format_spec, 0,
13787 PyUnicode_GET_LENGTH(format_spec));
13788 if (ret == -1) {
13789 _PyUnicodeWriter_Dealloc(&writer);
13790 return NULL;
13791 }
13792 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013793}
13794
INADA Naoki3ae20562017-01-16 20:41:20 +090013795/*[clinic input]
13796str.__sizeof__ as unicode_sizeof
13797
13798Return the size of the string in memory, in bytes.
13799[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013800
13801static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013802unicode_sizeof_impl(PyObject *self)
13803/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013804{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013805 Py_ssize_t size;
13806
13807 /* If it's a compact object, account for base structure +
13808 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013809 if (PyUnicode_IS_COMPACT_ASCII(self))
13810 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13811 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013812 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013813 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013814 else {
13815 /* If it is a two-block object, account for base object, and
13816 for character block if present. */
13817 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013818 if (_PyUnicode_DATA_ANY(self))
13819 size += (PyUnicode_GET_LENGTH(self) + 1) *
13820 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013821 }
13822 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013823 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013824 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13825 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13826 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13827 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013828
13829 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013830}
13831
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013832static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013833unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013834{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013835 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013836 if (!copy)
13837 return NULL;
13838 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013839}
13840
/* Method table for the unicode (str) type.

   Entries spelled UNICODE_*_METHODDEF are macros defined outside this
   table (presumably generated by Argument Clinic; confirm against the
   clinic header).  The remaining entries are written out by hand as
   {name, C function, calling-convention flags, docstring}.  The table is
   terminated by the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* format() accepts both positional and keyword arguments. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13897
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013898static PyObject *
13899unicode_mod(PyObject *v, PyObject *w)
13900{
Brian Curtindfc80e32011-08-10 20:28:54 -050013901 if (!PyUnicode_Check(v))
13902 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013903 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013904}
13905
/* Number-protocol slots.  Only nb_remainder is filled in, with
   unicode_mod; the three slots before it are explicitly 0 and the slots
   after it are left to zero-initialization. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13912
/* Sequence-protocol slots: length, concatenation, repetition, item access
   and containment are provided; the slice and item-assignment slots are
   0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13923
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013924static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013925unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013926{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013927 if (PyUnicode_READY(self) == -1)
13928 return NULL;
13929
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013930 if (PyIndex_Check(item)) {
13931 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013932 if (i == -1 && PyErr_Occurred())
13933 return NULL;
13934 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013935 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013936 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013937 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013938 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013939 PyObject *result;
13940 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013941 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013942 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013943
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013944 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013945 return NULL;
13946 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013947 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13948 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013949
13950 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013951 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013952 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013953 slicelength == PyUnicode_GET_LENGTH(self)) {
13954 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013955 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013956 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013957 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013958 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013959 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013960 src_kind = PyUnicode_KIND(self);
13961 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013962 if (!PyUnicode_IS_ASCII(self)) {
13963 kind_limit = kind_maxchar_limit(src_kind);
13964 max_char = 0;
13965 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13966 ch = PyUnicode_READ(src_kind, src_data, cur);
13967 if (ch > max_char) {
13968 max_char = ch;
13969 if (max_char >= kind_limit)
13970 break;
13971 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013972 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013973 }
Victor Stinner55c99112011-10-13 01:17:06 +020013974 else
13975 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013976 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013977 if (result == NULL)
13978 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013979 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013980 dest_data = PyUnicode_DATA(result);
13981
13982 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013983 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13984 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013985 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013986 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013987 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013988 } else {
13989 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13990 return NULL;
13991 }
13992}
13993
/* Mapping-protocol slots: length and subscript are provided; the
   subscript-assignment slot is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13999
Guido van Rossumd57fd912000-03-10 22:53:23 +000014000
Guido van Rossumd57fd912000-03-10 22:53:23 +000014001/* Helpers for PyUnicode_Format() */
14002
Victor Stinnera47082312012-10-04 02:19:54 +020014003struct unicode_formatter_t {
14004 PyObject *args;
14005 int args_owned;
14006 Py_ssize_t arglen, argidx;
14007 PyObject *dict;
14008
14009 enum PyUnicode_Kind fmtkind;
14010 Py_ssize_t fmtcnt, fmtpos;
14011 void *fmtdata;
14012 PyObject *fmtstr;
14013
14014 _PyUnicodeWriter writer;
14015};
14016
14017struct unicode_format_arg_t {
14018 Py_UCS4 ch;
14019 int flags;
14020 Py_ssize_t width;
14021 int prec;
14022 int sign;
14023};
14024
Guido van Rossumd57fd912000-03-10 22:53:23 +000014025static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014026unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014027{
Victor Stinnera47082312012-10-04 02:19:54 +020014028 Py_ssize_t argidx = ctx->argidx;
14029
14030 if (argidx < ctx->arglen) {
14031 ctx->argidx++;
14032 if (ctx->arglen < 0)
14033 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014034 else
Victor Stinnera47082312012-10-04 02:19:54 +020014035 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014036 }
14037 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014038 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014039 return NULL;
14040}
14041
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014042/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014043
Victor Stinnera47082312012-10-04 02:19:54 +020014044/* Format a float into the writer if the writer is not NULL, or into *p_output
14045 otherwise.
14046
14047 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014048static int
Victor Stinnera47082312012-10-04 02:19:54 +020014049formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14050 PyObject **p_output,
14051 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014052{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014053 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014054 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014055 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014056 int prec;
14057 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014058
Guido van Rossumd57fd912000-03-10 22:53:23 +000014059 x = PyFloat_AsDouble(v);
14060 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014061 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014062
Victor Stinnera47082312012-10-04 02:19:54 +020014063 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014064 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014065 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014066
Victor Stinnera47082312012-10-04 02:19:54 +020014067 if (arg->flags & F_ALT)
14068 dtoa_flags = Py_DTSF_ALT;
14069 else
14070 dtoa_flags = 0;
14071 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014072 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014073 return -1;
14074 len = strlen(p);
14075 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014076 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014077 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014078 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014079 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014080 }
14081 else
14082 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014083 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014084 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014085}
14086
Victor Stinnerd0880d52012-04-27 23:40:13 +020014087/* formatlong() emulates the format codes d, u, o, x and X, and
14088 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14089 * Python's regular ints.
14090 * Return value: a new PyUnicodeObject*, or NULL if error.
14091 * The output string is of the form
14092 * "-"? ("0x" | "0X")? digit+
14093 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14094 * set in flags. The case of hex digits will be correct,
14095 * There will be at least prec digits, zero-filled on the left if
14096 * necessary to get that many.
14097 * val object to be converted
14098 * flags bitmask of format flags; only F_ALT is looked at
14099 * prec minimum number of digits; 0-fill on left if needed
14100 * type a character in [duoxX]; u acts the same as d
14101 *
14102 * CAUTION: o, x and X conversions on regular ints can never
14103 * produce a '-' sign, but can for Python's unbounded ints.
14104 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014105PyObject *
14106_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014107{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014108 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014109 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014110 Py_ssize_t i;
14111 int sign; /* 1 if '-', else 0 */
14112 int len; /* number of characters */
14113 Py_ssize_t llen;
14114 int numdigits; /* len == numnondigits + numdigits */
14115 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014116
Victor Stinnerd0880d52012-04-27 23:40:13 +020014117 /* Avoid exceeding SSIZE_T_MAX */
14118 if (prec > INT_MAX-3) {
14119 PyErr_SetString(PyExc_OverflowError,
14120 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014121 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014122 }
14123
14124 assert(PyLong_Check(val));
14125
14126 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014127 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014128 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014129 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014130 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014131 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014132 /* int and int subclasses should print numerically when a numeric */
14133 /* format code is used (see issue18780) */
14134 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014135 break;
14136 case 'o':
14137 numnondigits = 2;
14138 result = PyNumber_ToBase(val, 8);
14139 break;
14140 case 'x':
14141 case 'X':
14142 numnondigits = 2;
14143 result = PyNumber_ToBase(val, 16);
14144 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014145 }
14146 if (!result)
14147 return NULL;
14148
14149 assert(unicode_modifiable(result));
14150 assert(PyUnicode_IS_READY(result));
14151 assert(PyUnicode_IS_ASCII(result));
14152
14153 /* To modify the string in-place, there can only be one reference. */
14154 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014155 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014156 PyErr_BadInternalCall();
14157 return NULL;
14158 }
14159 buf = PyUnicode_DATA(result);
14160 llen = PyUnicode_GET_LENGTH(result);
14161 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014162 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014163 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014164 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014165 return NULL;
14166 }
14167 len = (int)llen;
14168 sign = buf[0] == '-';
14169 numnondigits += sign;
14170 numdigits = len - numnondigits;
14171 assert(numdigits > 0);
14172
14173 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014174 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014175 (type == 'o' || type == 'x' || type == 'X'))) {
14176 assert(buf[sign] == '0');
14177 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14178 buf[sign+1] == 'o');
14179 numnondigits -= 2;
14180 buf += 2;
14181 len -= 2;
14182 if (sign)
14183 buf[0] = '-';
14184 assert(len == numnondigits + numdigits);
14185 assert(numdigits > 0);
14186 }
14187
14188 /* Fill with leading zeroes to meet minimum width. */
14189 if (prec > numdigits) {
14190 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14191 numnondigits + prec);
14192 char *b1;
14193 if (!r1) {
14194 Py_DECREF(result);
14195 return NULL;
14196 }
14197 b1 = PyBytes_AS_STRING(r1);
14198 for (i = 0; i < numnondigits; ++i)
14199 *b1++ = *buf++;
14200 for (i = 0; i < prec - numdigits; i++)
14201 *b1++ = '0';
14202 for (i = 0; i < numdigits; i++)
14203 *b1++ = *buf++;
14204 *b1 = '\0';
14205 Py_DECREF(result);
14206 result = r1;
14207 buf = PyBytes_AS_STRING(result);
14208 len = numnondigits + prec;
14209 }
14210
14211 /* Fix up case for hex conversions. */
14212 if (type == 'X') {
14213 /* Need to convert all lower case letters to upper case.
14214 and need to convert 0x to 0X (and -0x to -0X). */
14215 for (i = 0; i < len; i++)
14216 if (buf[i] >= 'a' && buf[i] <= 'x')
14217 buf[i] -= 'a'-'A';
14218 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014219 if (!PyUnicode_Check(result)
14220 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014221 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014222 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014223 Py_DECREF(result);
14224 result = unicode;
14225 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014226 else if (len != PyUnicode_GET_LENGTH(result)) {
14227 if (PyUnicode_Resize(&result, len) < 0)
14228 Py_CLEAR(result);
14229 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014230 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014231}
14232
Ethan Furmandf3ed242014-01-05 06:50:30 -080014233/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014234 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014235 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014236 * -1 and raise an exception on error */
14237static int
Victor Stinnera47082312012-10-04 02:19:54 +020014238mainformatlong(PyObject *v,
14239 struct unicode_format_arg_t *arg,
14240 PyObject **p_output,
14241 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014242{
14243 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014244 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014245
14246 if (!PyNumber_Check(v))
14247 goto wrongtype;
14248
Ethan Furman9ab74802014-03-21 06:38:46 -070014249 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014250 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014251 if (type == 'o' || type == 'x' || type == 'X') {
14252 iobj = PyNumber_Index(v);
14253 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014254 if (PyErr_ExceptionMatches(PyExc_TypeError))
14255 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014256 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014257 }
14258 }
14259 else {
14260 iobj = PyNumber_Long(v);
14261 if (iobj == NULL ) {
14262 if (PyErr_ExceptionMatches(PyExc_TypeError))
14263 goto wrongtype;
14264 return -1;
14265 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014266 }
14267 assert(PyLong_Check(iobj));
14268 }
14269 else {
14270 iobj = v;
14271 Py_INCREF(iobj);
14272 }
14273
14274 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014275 && arg->width == -1 && arg->prec == -1
14276 && !(arg->flags & (F_SIGN | F_BLANK))
14277 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014278 {
14279 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014280 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014281 int base;
14282
Victor Stinnera47082312012-10-04 02:19:54 +020014283 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014284 {
14285 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014286 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014287 case 'd':
14288 case 'i':
14289 case 'u':
14290 base = 10;
14291 break;
14292 case 'o':
14293 base = 8;
14294 break;
14295 case 'x':
14296 case 'X':
14297 base = 16;
14298 break;
14299 }
14300
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014301 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14302 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014303 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014304 }
14305 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014306 return 1;
14307 }
14308
Ethan Furmanb95b5612015-01-23 20:05:18 -080014309 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014310 Py_DECREF(iobj);
14311 if (res == NULL)
14312 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014313 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014314 return 0;
14315
14316wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014317 switch(type)
14318 {
14319 case 'o':
14320 case 'x':
14321 case 'X':
14322 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014323 "%%%c format: an integer is required, "
14324 "not %.200s",
14325 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014326 break;
14327 default:
14328 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014329 "%%%c format: a number is required, "
14330 "not %.200s",
14331 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014332 break;
14333 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014334 return -1;
14335}
14336
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014337static Py_UCS4
14338formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014339{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014340 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014341 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014342 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014343 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014344 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014345 goto onError;
14346 }
14347 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014348 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014349 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014350 /* make sure number is a type of integer */
14351 if (!PyLong_Check(v)) {
14352 iobj = PyNumber_Index(v);
14353 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014354 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014355 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014356 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014357 Py_DECREF(iobj);
14358 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014359 else {
14360 x = PyLong_AsLong(v);
14361 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014362 if (x == -1 && PyErr_Occurred())
14363 goto onError;
14364
Victor Stinner8faf8212011-12-08 22:14:11 +010014365 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014366 PyErr_SetString(PyExc_OverflowError,
14367 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014368 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014369 }
14370
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014371 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014372 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014373
Benjamin Peterson29060642009-01-31 22:14:21 +000014374 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014375 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014376 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014377 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014378}
14379
Victor Stinnera47082312012-10-04 02:19:54 +020014380/* Parse options of an argument: flags, width, precision.
14381 Handle also "%(name)" syntax.
14382
14383 Return 0 if the argument has been formatted into arg->str.
14384 Return 1 if the argument has been written into ctx->writer,
14385 Raise an exception and return -1 on error. */
14386static int
14387unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14388 struct unicode_format_arg_t *arg)
14389{
14390#define FORMAT_READ(ctx) \
14391 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14392
14393 PyObject *v;
14394
Victor Stinnera47082312012-10-04 02:19:54 +020014395 if (arg->ch == '(') {
14396 /* Get argument value from a dictionary. Example: "%(name)s". */
14397 Py_ssize_t keystart;
14398 Py_ssize_t keylen;
14399 PyObject *key;
14400 int pcount = 1;
14401
14402 if (ctx->dict == NULL) {
14403 PyErr_SetString(PyExc_TypeError,
14404 "format requires a mapping");
14405 return -1;
14406 }
14407 ++ctx->fmtpos;
14408 --ctx->fmtcnt;
14409 keystart = ctx->fmtpos;
14410 /* Skip over balanced parentheses */
14411 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14412 arg->ch = FORMAT_READ(ctx);
14413 if (arg->ch == ')')
14414 --pcount;
14415 else if (arg->ch == '(')
14416 ++pcount;
14417 ctx->fmtpos++;
14418 }
14419 keylen = ctx->fmtpos - keystart - 1;
14420 if (ctx->fmtcnt < 0 || pcount > 0) {
14421 PyErr_SetString(PyExc_ValueError,
14422 "incomplete format key");
14423 return -1;
14424 }
14425 key = PyUnicode_Substring(ctx->fmtstr,
14426 keystart, keystart + keylen);
14427 if (key == NULL)
14428 return -1;
14429 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014430 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014431 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014432 }
14433 ctx->args = PyObject_GetItem(ctx->dict, key);
14434 Py_DECREF(key);
14435 if (ctx->args == NULL)
14436 return -1;
14437 ctx->args_owned = 1;
14438 ctx->arglen = -1;
14439 ctx->argidx = -2;
14440 }
14441
14442 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014443 while (--ctx->fmtcnt >= 0) {
14444 arg->ch = FORMAT_READ(ctx);
14445 ctx->fmtpos++;
14446 switch (arg->ch) {
14447 case '-': arg->flags |= F_LJUST; continue;
14448 case '+': arg->flags |= F_SIGN; continue;
14449 case ' ': arg->flags |= F_BLANK; continue;
14450 case '#': arg->flags |= F_ALT; continue;
14451 case '0': arg->flags |= F_ZERO; continue;
14452 }
14453 break;
14454 }
14455
14456 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014457 if (arg->ch == '*') {
14458 v = unicode_format_getnextarg(ctx);
14459 if (v == NULL)
14460 return -1;
14461 if (!PyLong_Check(v)) {
14462 PyErr_SetString(PyExc_TypeError,
14463 "* wants int");
14464 return -1;
14465 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014466 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014467 if (arg->width == -1 && PyErr_Occurred())
14468 return -1;
14469 if (arg->width < 0) {
14470 arg->flags |= F_LJUST;
14471 arg->width = -arg->width;
14472 }
14473 if (--ctx->fmtcnt >= 0) {
14474 arg->ch = FORMAT_READ(ctx);
14475 ctx->fmtpos++;
14476 }
14477 }
14478 else if (arg->ch >= '0' && arg->ch <= '9') {
14479 arg->width = arg->ch - '0';
14480 while (--ctx->fmtcnt >= 0) {
14481 arg->ch = FORMAT_READ(ctx);
14482 ctx->fmtpos++;
14483 if (arg->ch < '0' || arg->ch > '9')
14484 break;
14485 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14486 mixing signed and unsigned comparison. Since arg->ch is between
14487 '0' and '9', casting to int is safe. */
14488 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14489 PyErr_SetString(PyExc_ValueError,
14490 "width too big");
14491 return -1;
14492 }
14493 arg->width = arg->width*10 + (arg->ch - '0');
14494 }
14495 }
14496
14497 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014498 if (arg->ch == '.') {
14499 arg->prec = 0;
14500 if (--ctx->fmtcnt >= 0) {
14501 arg->ch = FORMAT_READ(ctx);
14502 ctx->fmtpos++;
14503 }
14504 if (arg->ch == '*') {
14505 v = unicode_format_getnextarg(ctx);
14506 if (v == NULL)
14507 return -1;
14508 if (!PyLong_Check(v)) {
14509 PyErr_SetString(PyExc_TypeError,
14510 "* wants int");
14511 return -1;
14512 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014513 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014514 if (arg->prec == -1 && PyErr_Occurred())
14515 return -1;
14516 if (arg->prec < 0)
14517 arg->prec = 0;
14518 if (--ctx->fmtcnt >= 0) {
14519 arg->ch = FORMAT_READ(ctx);
14520 ctx->fmtpos++;
14521 }
14522 }
14523 else if (arg->ch >= '0' && arg->ch <= '9') {
14524 arg->prec = arg->ch - '0';
14525 while (--ctx->fmtcnt >= 0) {
14526 arg->ch = FORMAT_READ(ctx);
14527 ctx->fmtpos++;
14528 if (arg->ch < '0' || arg->ch > '9')
14529 break;
14530 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14531 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014532 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014533 return -1;
14534 }
14535 arg->prec = arg->prec*10 + (arg->ch - '0');
14536 }
14537 }
14538 }
14539
14540 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14541 if (ctx->fmtcnt >= 0) {
14542 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14543 if (--ctx->fmtcnt >= 0) {
14544 arg->ch = FORMAT_READ(ctx);
14545 ctx->fmtpos++;
14546 }
14547 }
14548 }
14549 if (ctx->fmtcnt < 0) {
14550 PyErr_SetString(PyExc_ValueError,
14551 "incomplete format");
14552 return -1;
14553 }
14554 return 0;
14555
14556#undef FORMAT_READ
14557}
14558
14559/* Format one argument. Supported conversion specifiers:
14560
14561 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014562 - "i", "d", "u": int or float
14563 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014564 - "e", "E", "f", "F", "g", "G": float
14565 - "c": int or str (1 character)
14566
Victor Stinner8dbd4212012-12-04 09:30:24 +010014567 When possible, the output is written directly into the Unicode writer
14568 (ctx->writer). A string is created when padding is required.
14569
Victor Stinnera47082312012-10-04 02:19:54 +020014570 Return 0 if the argument has been formatted into *p_str,
14571 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014572 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014573static int
14574unicode_format_arg_format(struct unicode_formatter_t *ctx,
14575 struct unicode_format_arg_t *arg,
14576 PyObject **p_str)
14577{
14578 PyObject *v;
14579 _PyUnicodeWriter *writer = &ctx->writer;
14580
14581 if (ctx->fmtcnt == 0)
14582 ctx->writer.overallocate = 0;
14583
Victor Stinnera47082312012-10-04 02:19:54 +020014584 v = unicode_format_getnextarg(ctx);
14585 if (v == NULL)
14586 return -1;
14587
Victor Stinnera47082312012-10-04 02:19:54 +020014588
14589 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014590 case 's':
14591 case 'r':
14592 case 'a':
14593 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14594 /* Fast path */
14595 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14596 return -1;
14597 return 1;
14598 }
14599
14600 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14601 *p_str = v;
14602 Py_INCREF(*p_str);
14603 }
14604 else {
14605 if (arg->ch == 's')
14606 *p_str = PyObject_Str(v);
14607 else if (arg->ch == 'r')
14608 *p_str = PyObject_Repr(v);
14609 else
14610 *p_str = PyObject_ASCII(v);
14611 }
14612 break;
14613
14614 case 'i':
14615 case 'd':
14616 case 'u':
14617 case 'o':
14618 case 'x':
14619 case 'X':
14620 {
14621 int ret = mainformatlong(v, arg, p_str, writer);
14622 if (ret != 0)
14623 return ret;
14624 arg->sign = 1;
14625 break;
14626 }
14627
14628 case 'e':
14629 case 'E':
14630 case 'f':
14631 case 'F':
14632 case 'g':
14633 case 'G':
14634 if (arg->width == -1 && arg->prec == -1
14635 && !(arg->flags & (F_SIGN | F_BLANK)))
14636 {
14637 /* Fast path */
14638 if (formatfloat(v, arg, NULL, writer) == -1)
14639 return -1;
14640 return 1;
14641 }
14642
14643 arg->sign = 1;
14644 if (formatfloat(v, arg, p_str, NULL) == -1)
14645 return -1;
14646 break;
14647
14648 case 'c':
14649 {
14650 Py_UCS4 ch = formatchar(v);
14651 if (ch == (Py_UCS4) -1)
14652 return -1;
14653 if (arg->width == -1 && arg->prec == -1) {
14654 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014655 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014656 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014657 return 1;
14658 }
14659 *p_str = PyUnicode_FromOrdinal(ch);
14660 break;
14661 }
14662
14663 default:
14664 PyErr_Format(PyExc_ValueError,
14665 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014666 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014667 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14668 (int)arg->ch,
14669 ctx->fmtpos - 1);
14670 return -1;
14671 }
14672 if (*p_str == NULL)
14673 return -1;
14674 assert (PyUnicode_Check(*p_str));
14675 return 0;
14676}
14677
14678static int
14679unicode_format_arg_output(struct unicode_formatter_t *ctx,
14680 struct unicode_format_arg_t *arg,
14681 PyObject *str)
14682{
14683 Py_ssize_t len;
14684 enum PyUnicode_Kind kind;
14685 void *pbuf;
14686 Py_ssize_t pindex;
14687 Py_UCS4 signchar;
14688 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014689 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014690 Py_ssize_t sublen;
14691 _PyUnicodeWriter *writer = &ctx->writer;
14692 Py_UCS4 fill;
14693
14694 fill = ' ';
14695 if (arg->sign && arg->flags & F_ZERO)
14696 fill = '0';
14697
14698 if (PyUnicode_READY(str) == -1)
14699 return -1;
14700
14701 len = PyUnicode_GET_LENGTH(str);
14702 if ((arg->width == -1 || arg->width <= len)
14703 && (arg->prec == -1 || arg->prec >= len)
14704 && !(arg->flags & (F_SIGN | F_BLANK)))
14705 {
14706 /* Fast path */
14707 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14708 return -1;
14709 return 0;
14710 }
14711
14712 /* Truncate the string for "s", "r" and "a" formats
14713 if the precision is set */
14714 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14715 if (arg->prec >= 0 && len > arg->prec)
14716 len = arg->prec;
14717 }
14718
14719 /* Adjust sign and width */
14720 kind = PyUnicode_KIND(str);
14721 pbuf = PyUnicode_DATA(str);
14722 pindex = 0;
14723 signchar = '\0';
14724 if (arg->sign) {
14725 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14726 if (ch == '-' || ch == '+') {
14727 signchar = ch;
14728 len--;
14729 pindex++;
14730 }
14731 else if (arg->flags & F_SIGN)
14732 signchar = '+';
14733 else if (arg->flags & F_BLANK)
14734 signchar = ' ';
14735 else
14736 arg->sign = 0;
14737 }
14738 if (arg->width < len)
14739 arg->width = len;
14740
14741 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014742 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014743 if (!(arg->flags & F_LJUST)) {
14744 if (arg->sign) {
14745 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014746 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014747 }
14748 else {
14749 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014750 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014751 }
14752 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014753 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14754 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014755 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014756 }
14757
Victor Stinnera47082312012-10-04 02:19:54 +020014758 buflen = arg->width;
14759 if (arg->sign && len == arg->width)
14760 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014761 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014762 return -1;
14763
14764 /* Write the sign if needed */
14765 if (arg->sign) {
14766 if (fill != ' ') {
14767 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14768 writer->pos += 1;
14769 }
14770 if (arg->width > len)
14771 arg->width--;
14772 }
14773
14774 /* Write the numeric prefix for "x", "X" and "o" formats
14775 if the alternate form is used.
14776 For example, write "0x" for the "%#x" format. */
14777 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14778 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14779 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14780 if (fill != ' ') {
14781 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14782 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14783 writer->pos += 2;
14784 pindex += 2;
14785 }
14786 arg->width -= 2;
14787 if (arg->width < 0)
14788 arg->width = 0;
14789 len -= 2;
14790 }
14791
14792 /* Pad left with the fill character if needed */
14793 if (arg->width > len && !(arg->flags & F_LJUST)) {
14794 sublen = arg->width - len;
14795 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14796 writer->pos += sublen;
14797 arg->width = len;
14798 }
14799
14800 /* If padding with spaces: write sign if needed and/or numeric prefix if
14801 the alternate form is used */
14802 if (fill == ' ') {
14803 if (arg->sign) {
14804 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14805 writer->pos += 1;
14806 }
14807 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14808 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14809 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14810 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14811 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14812 writer->pos += 2;
14813 pindex += 2;
14814 }
14815 }
14816
14817 /* Write characters */
14818 if (len) {
14819 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14820 str, pindex, len);
14821 writer->pos += len;
14822 }
14823
14824 /* Pad right with the fill character if needed */
14825 if (arg->width > len) {
14826 sublen = arg->width - len;
14827 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14828 writer->pos += sublen;
14829 }
14830 return 0;
14831}
14832
14833/* Helper of PyUnicode_Format(): format one arg.
14834 Return 0 on success, raise an exception and return -1 on error. */
14835static int
14836unicode_format_arg(struct unicode_formatter_t *ctx)
14837{
14838 struct unicode_format_arg_t arg;
14839 PyObject *str;
14840 int ret;
14841
Victor Stinner8dbd4212012-12-04 09:30:24 +010014842 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014843 if (arg.ch == '%') {
14844 ctx->fmtpos++;
14845 ctx->fmtcnt--;
14846 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14847 return -1;
14848 return 0;
14849 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014850 arg.flags = 0;
14851 arg.width = -1;
14852 arg.prec = -1;
14853 arg.sign = 0;
14854 str = NULL;
14855
Victor Stinnera47082312012-10-04 02:19:54 +020014856 ret = unicode_format_arg_parse(ctx, &arg);
14857 if (ret == -1)
14858 return -1;
14859
14860 ret = unicode_format_arg_format(ctx, &arg, &str);
14861 if (ret == -1)
14862 return -1;
14863
14864 if (ret != 1) {
14865 ret = unicode_format_arg_output(ctx, &arg, str);
14866 Py_DECREF(str);
14867 if (ret == -1)
14868 return -1;
14869 }
14870
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014871 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014872 PyErr_SetString(PyExc_TypeError,
14873 "not all arguments converted during string formatting");
14874 return -1;
14875 }
14876 return 0;
14877}
14878
Alexander Belopolsky40018472011-02-26 01:02:56 +000014879PyObject *
14880PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014881{
Victor Stinnera47082312012-10-04 02:19:54 +020014882 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014883
Guido van Rossumd57fd912000-03-10 22:53:23 +000014884 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014885 PyErr_BadInternalCall();
14886 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014887 }
Victor Stinnera47082312012-10-04 02:19:54 +020014888
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014889 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014890 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014891
14892 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014893 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14894 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14895 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14896 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014897
Victor Stinner8f674cc2013-04-17 23:02:17 +020014898 _PyUnicodeWriter_Init(&ctx.writer);
14899 ctx.writer.min_length = ctx.fmtcnt + 100;
14900 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014901
Guido van Rossumd57fd912000-03-10 22:53:23 +000014902 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014903 ctx.arglen = PyTuple_Size(args);
14904 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014905 }
14906 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014907 ctx.arglen = -1;
14908 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014909 }
Victor Stinnera47082312012-10-04 02:19:54 +020014910 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014911 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014912 ctx.dict = args;
14913 else
14914 ctx.dict = NULL;
14915 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014916
Victor Stinnera47082312012-10-04 02:19:54 +020014917 while (--ctx.fmtcnt >= 0) {
14918 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014919 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014920
14921 nonfmtpos = ctx.fmtpos++;
14922 while (ctx.fmtcnt >= 0 &&
14923 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14924 ctx.fmtpos++;
14925 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014926 }
Victor Stinnera47082312012-10-04 02:19:54 +020014927 if (ctx.fmtcnt < 0) {
14928 ctx.fmtpos--;
14929 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014930 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014931
Victor Stinnercfc4c132013-04-03 01:48:39 +020014932 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14933 nonfmtpos, ctx.fmtpos) < 0)
14934 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014935 }
14936 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014937 ctx.fmtpos++;
14938 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014939 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014940 }
14941 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014942
Victor Stinnera47082312012-10-04 02:19:54 +020014943 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014944 PyErr_SetString(PyExc_TypeError,
14945 "not all arguments converted during string formatting");
14946 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014947 }
14948
Victor Stinnera47082312012-10-04 02:19:54 +020014949 if (ctx.args_owned) {
14950 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014951 }
Victor Stinnera47082312012-10-04 02:19:54 +020014952 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014953
Benjamin Peterson29060642009-01-31 22:14:21 +000014954 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014955 _PyUnicodeWriter_Dealloc(&ctx.writer);
14956 if (ctx.args_owned) {
14957 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014958 }
14959 return NULL;
14960}
14961
Jeremy Hylton938ace62002-07-17 16:30:39 +000014962static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014963unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14964
Tim Peters6d6c1a32001-08-02 04:15:00 +000014965static PyObject *
14966unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14967{
Benjamin Peterson29060642009-01-31 22:14:21 +000014968 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014969 static char *kwlist[] = {"object", "encoding", "errors", 0};
14970 char *encoding = NULL;
14971 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014972
Benjamin Peterson14339b62009-01-31 16:36:08 +000014973 if (type != &PyUnicode_Type)
14974 return unicode_subtype_new(type, args, kwds);
14975 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014976 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014977 return NULL;
14978 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014979 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014980 if (encoding == NULL && errors == NULL)
14981 return PyObject_Str(x);
14982 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014983 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014984}
14985
Guido van Rossume023fe02001-08-30 03:12:59 +000014986static PyObject *
14987unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14988{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014989 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014990 Py_ssize_t length, char_size;
14991 int share_wstr, share_utf8;
14992 unsigned int kind;
14993 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014994
Benjamin Peterson14339b62009-01-31 16:36:08 +000014995 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014996
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014997 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014998 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014999 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015000 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015001 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015002 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015003 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015004 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015005
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015006 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015007 if (self == NULL) {
15008 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015009 return NULL;
15010 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015011 kind = PyUnicode_KIND(unicode);
15012 length = PyUnicode_GET_LENGTH(unicode);
15013
15014 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015015#ifdef Py_DEBUG
15016 _PyUnicode_HASH(self) = -1;
15017#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015018 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015019#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015020 _PyUnicode_STATE(self).interned = 0;
15021 _PyUnicode_STATE(self).kind = kind;
15022 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015023 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015024 _PyUnicode_STATE(self).ready = 1;
15025 _PyUnicode_WSTR(self) = NULL;
15026 _PyUnicode_UTF8_LENGTH(self) = 0;
15027 _PyUnicode_UTF8(self) = NULL;
15028 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015029 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015030
15031 share_utf8 = 0;
15032 share_wstr = 0;
15033 if (kind == PyUnicode_1BYTE_KIND) {
15034 char_size = 1;
15035 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15036 share_utf8 = 1;
15037 }
15038 else if (kind == PyUnicode_2BYTE_KIND) {
15039 char_size = 2;
15040 if (sizeof(wchar_t) == 2)
15041 share_wstr = 1;
15042 }
15043 else {
15044 assert(kind == PyUnicode_4BYTE_KIND);
15045 char_size = 4;
15046 if (sizeof(wchar_t) == 4)
15047 share_wstr = 1;
15048 }
15049
15050 /* Ensure we won't overflow the length. */
15051 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15052 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015053 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015054 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015055 data = PyObject_MALLOC((length + 1) * char_size);
15056 if (data == NULL) {
15057 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015058 goto onError;
15059 }
15060
Victor Stinnerc3c74152011-10-02 20:39:55 +020015061 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015062 if (share_utf8) {
15063 _PyUnicode_UTF8_LENGTH(self) = length;
15064 _PyUnicode_UTF8(self) = data;
15065 }
15066 if (share_wstr) {
15067 _PyUnicode_WSTR_LENGTH(self) = length;
15068 _PyUnicode_WSTR(self) = (wchar_t *)data;
15069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015070
Christian Heimesf051e432016-09-13 20:22:02 +020015071 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015072 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015073 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015074#ifdef Py_DEBUG
15075 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15076#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015077 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015078 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015079
15080onError:
15081 Py_DECREF(unicode);
15082 Py_DECREF(self);
15083 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015084}
15085
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015086PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015087"str(object='') -> str\n\
15088str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015089\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015090Create a new string object from the given object. If encoding or\n\
15091errors is specified, then the object must expose a data buffer\n\
15092that will be decoded using the given encoding and error handler.\n\
15093Otherwise, returns the result of object.__str__() (if defined)\n\
15094or repr(object).\n\
15095encoding defaults to sys.getdefaultencoding().\n\
15096errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015097
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015098static PyObject *unicode_iter(PyObject *seq);
15099
Guido van Rossumd57fd912000-03-10 22:53:23 +000015100PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015101 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015102 "str", /* tp_name */
15103 sizeof(PyUnicodeObject), /* tp_basicsize */
15104 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015105 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015106 (destructor)unicode_dealloc, /* tp_dealloc */
15107 0, /* tp_print */
15108 0, /* tp_getattr */
15109 0, /* tp_setattr */
15110 0, /* tp_reserved */
15111 unicode_repr, /* tp_repr */
15112 &unicode_as_number, /* tp_as_number */
15113 &unicode_as_sequence, /* tp_as_sequence */
15114 &unicode_as_mapping, /* tp_as_mapping */
15115 (hashfunc) unicode_hash, /* tp_hash*/
15116 0, /* tp_call*/
15117 (reprfunc) unicode_str, /* tp_str */
15118 PyObject_GenericGetAttr, /* tp_getattro */
15119 0, /* tp_setattro */
15120 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015121 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015122 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15123 unicode_doc, /* tp_doc */
15124 0, /* tp_traverse */
15125 0, /* tp_clear */
15126 PyUnicode_RichCompare, /* tp_richcompare */
15127 0, /* tp_weaklistoffset */
15128 unicode_iter, /* tp_iter */
15129 0, /* tp_iternext */
15130 unicode_methods, /* tp_methods */
15131 0, /* tp_members */
15132 0, /* tp_getset */
15133 &PyBaseObject_Type, /* tp_base */
15134 0, /* tp_dict */
15135 0, /* tp_descr_get */
15136 0, /* tp_descr_set */
15137 0, /* tp_dictoffset */
15138 0, /* tp_init */
15139 0, /* tp_alloc */
15140 unicode_new, /* tp_new */
15141 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015142};
15143
15144/* Initialize the Unicode implementation */
15145
Victor Stinner3a50e702011-10-18 21:21:00 +020015146int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015147{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015148 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015149 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015150 0x000A, /* LINE FEED */
15151 0x000D, /* CARRIAGE RETURN */
15152 0x001C, /* FILE SEPARATOR */
15153 0x001D, /* GROUP SEPARATOR */
15154 0x001E, /* RECORD SEPARATOR */
15155 0x0085, /* NEXT LINE */
15156 0x2028, /* LINE SEPARATOR */
15157 0x2029, /* PARAGRAPH SEPARATOR */
15158 };
15159
Fred Drakee4315f52000-05-09 19:53:39 +000015160 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015161 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015162 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015163 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015164 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015165
Guido van Rossumcacfc072002-05-24 19:01:59 +000015166 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015167 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015168
15169 /* initialize the linebreak bloom filter */
15170 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015171 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015172 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015173
Christian Heimes26532f72013-07-20 14:57:16 +020015174 if (PyType_Ready(&EncodingMapType) < 0)
15175 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015176
Benjamin Petersonc4311282012-10-30 23:21:10 -040015177 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15178 Py_FatalError("Can't initialize field name iterator type");
15179
15180 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15181 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015182
Victor Stinner3a50e702011-10-18 21:21:00 +020015183 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015184}
15185
15186/* Finalize the Unicode implementation */
15187
/* This function does no work here: it always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15193
Guido van Rossumd57fd912000-03-10 22:53:23 +000015194void
Thomas Wouters78890102000-07-22 19:25:51 +000015195_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015196{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015197 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015198
Serhiy Storchaka05997252013-01-26 12:14:02 +020015199 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015200
Serhiy Storchaka05997252013-01-26 12:14:02 +020015201 for (i = 0; i < 256; i++)
15202 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015203 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015204 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015205}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015206
Walter Dörwald16807132007-05-25 13:52:07 +000015207void
15208PyUnicode_InternInPlace(PyObject **p)
15209{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015210 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015211 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015212#ifdef Py_DEBUG
15213 assert(s != NULL);
15214 assert(_PyUnicode_CHECK(s));
15215#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015216 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015217 return;
15218#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015219 /* If it's a subclass, we don't really know what putting
15220 it in the interned dict might do. */
15221 if (!PyUnicode_CheckExact(s))
15222 return;
15223 if (PyUnicode_CHECK_INTERNED(s))
15224 return;
15225 if (interned == NULL) {
15226 interned = PyDict_New();
15227 if (interned == NULL) {
15228 PyErr_Clear(); /* Don't leave an exception */
15229 return;
15230 }
15231 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015232 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015233 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015234 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015235 if (t == NULL) {
15236 PyErr_Clear();
15237 return;
15238 }
15239 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015240 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015241 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015242 return;
15243 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015244 /* The two references in interned are not counted by refcnt.
15245 The deallocator will take care of this */
15246 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015247 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015248}
15249
15250void
15251PyUnicode_InternImmortal(PyObject **p)
15252{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015253 PyUnicode_InternInPlace(p);
15254 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015255 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015256 Py_INCREF(*p);
15257 }
Walter Dörwald16807132007-05-25 13:52:07 +000015258}
15259
15260PyObject *
15261PyUnicode_InternFromString(const char *cp)
15262{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015263 PyObject *s = PyUnicode_FromString(cp);
15264 if (s == NULL)
15265 return NULL;
15266 PyUnicode_InternInPlace(&s);
15267 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015268}
15269
Alexander Belopolsky40018472011-02-26 01:02:56 +000015270void
15271_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015272{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015273 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015274 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015275 Py_ssize_t i, n;
15276 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015277
Benjamin Peterson14339b62009-01-31 16:36:08 +000015278 if (interned == NULL || !PyDict_Check(interned))
15279 return;
15280 keys = PyDict_Keys(interned);
15281 if (keys == NULL || !PyList_Check(keys)) {
15282 PyErr_Clear();
15283 return;
15284 }
Walter Dörwald16807132007-05-25 13:52:07 +000015285
Benjamin Peterson14339b62009-01-31 16:36:08 +000015286 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15287 detector, interned unicode strings are not forcibly deallocated;
15288 rather, we give them their stolen references back, and then clear
15289 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015290
Benjamin Peterson14339b62009-01-31 16:36:08 +000015291 n = PyList_GET_SIZE(keys);
15292 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015293 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015294 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015295 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015296 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015297 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015298 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015299 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015300 case SSTATE_NOT_INTERNED:
15301 /* XXX Shouldn't happen */
15302 break;
15303 case SSTATE_INTERNED_IMMORTAL:
15304 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015305 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015306 break;
15307 case SSTATE_INTERNED_MORTAL:
15308 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015309 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015310 break;
15311 default:
15312 Py_FatalError("Inconsistent interned string state.");
15313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015314 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015315 }
15316 fprintf(stderr, "total size of all interned strings: "
15317 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15318 "mortal/immortal\n", mortal_size, immortal_size);
15319 Py_DECREF(keys);
15320 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015321 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015322}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015323
15324
15325/********************* Unicode Iterator **************************/
15326
15327typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015328 PyObject_HEAD
15329 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015330 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015331} unicodeiterobject;
15332
15333static void
15334unicodeiter_dealloc(unicodeiterobject *it)
15335{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015336 _PyObject_GC_UNTRACK(it);
15337 Py_XDECREF(it->it_seq);
15338 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015339}
15340
15341static int
15342unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15343{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015344 Py_VISIT(it->it_seq);
15345 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015346}
15347
15348static PyObject *
15349unicodeiter_next(unicodeiterobject *it)
15350{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015351 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015352
Benjamin Peterson14339b62009-01-31 16:36:08 +000015353 assert(it != NULL);
15354 seq = it->it_seq;
15355 if (seq == NULL)
15356 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015357 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015359 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15360 int kind = PyUnicode_KIND(seq);
15361 void *data = PyUnicode_DATA(seq);
15362 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15363 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015364 if (item != NULL)
15365 ++it->it_index;
15366 return item;
15367 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015368
Benjamin Peterson14339b62009-01-31 16:36:08 +000015369 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015370 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015371 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015372}
15373
15374static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015375unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015376{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015377 Py_ssize_t len = 0;
15378 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015379 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015380 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015381}
15382
15383PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15384
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015385static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015386unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015387{
15388 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015389 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015390 it->it_seq, it->it_index);
15391 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015392 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015393 if (u == NULL)
15394 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015395 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015396 }
15397}
15398
15399PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15400
15401static PyObject *
15402unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15403{
15404 Py_ssize_t index = PyLong_AsSsize_t(state);
15405 if (index == -1 && PyErr_Occurred())
15406 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015407 if (it->it_seq != NULL) {
15408 if (index < 0)
15409 index = 0;
15410 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15411 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15412 it->it_index = index;
15413 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015414 Py_RETURN_NONE;
15415}
15416
15417PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15418
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015419static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015420 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015421 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015422 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15423 reduce_doc},
15424 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15425 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015426 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015427};
15428
15429PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015430 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15431 "str_iterator", /* tp_name */
15432 sizeof(unicodeiterobject), /* tp_basicsize */
15433 0, /* tp_itemsize */
15434 /* methods */
15435 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15436 0, /* tp_print */
15437 0, /* tp_getattr */
15438 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015439 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015440 0, /* tp_repr */
15441 0, /* tp_as_number */
15442 0, /* tp_as_sequence */
15443 0, /* tp_as_mapping */
15444 0, /* tp_hash */
15445 0, /* tp_call */
15446 0, /* tp_str */
15447 PyObject_GenericGetAttr, /* tp_getattro */
15448 0, /* tp_setattro */
15449 0, /* tp_as_buffer */
15450 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15451 0, /* tp_doc */
15452 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15453 0, /* tp_clear */
15454 0, /* tp_richcompare */
15455 0, /* tp_weaklistoffset */
15456 PyObject_SelfIter, /* tp_iter */
15457 (iternextfunc)unicodeiter_next, /* tp_iternext */
15458 unicodeiter_methods, /* tp_methods */
15459 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015460};
15461
15462static PyObject *
15463unicode_iter(PyObject *seq)
15464{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015465 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015466
Benjamin Peterson14339b62009-01-31 16:36:08 +000015467 if (!PyUnicode_Check(seq)) {
15468 PyErr_BadInternalCall();
15469 return NULL;
15470 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015471 if (PyUnicode_READY(seq) == -1)
15472 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015473 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15474 if (it == NULL)
15475 return NULL;
15476 it->it_index = 0;
15477 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015478 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015479 _PyObject_GC_TRACK(it);
15480 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015481}
15482
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015483
/* Number of Py_UNICODE elements in `u` before its terminating null;
   delegates to wcslen(). */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t length = wcslen(u);

    return length;
}
15489
/* Copy the null-terminated string `s2`, terminator included, into `s1`.
   The caller must supply a large enough destination.  Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;

        *dst++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15497
/* Copy at most `n` elements of the null-terminated string `s2` into `s1`.

   Copying stops after the terminating null has been copied or after `n`
   elements have been written, whichever comes first; nothing is written
   when `n` is 0.  As before, the rest of the destination is not padded,
   and the result is not null-terminated when `s2` holds `n` or more
   characters.  Returns `s1`.

   The previous loop tested the count only after storing each element,
   so it wrote up to n + 1 elements (and one element for n == 0),
   overrunning a destination sized for `n`. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *dst = s1;

    while (n > 0) {
        n--;
        /* Check the budget before the store, never after it. */
        if ((*dst++ = *s2++) == 0) {
            break;
        }
    }
    return s1;
}
15507
/* Append the null-terminated string `s2`, terminator included, to the
   end of the null-terminated string in `s1`.  The caller must supply
   enough room.  Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Start writing over the terminator of s1. */
    Py_UNICODE *dst = s1 + wcslen(s1);

    do {
        *dst = *s2++;
    } while (*dst++ != 0);

    return s1;
}
15516
/* Compare two null-terminated strings element by element.
   Returns -1, 0 or +1.  When one string is a proper prefix of the other,
   the shorter one compares as smaller. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (; *s1 != 0 && *s2 != 0; s1++, s2++) {
        if (*s1 != *s2) {
            return (*s1 < *s2) ? -1 : +1;
        }
    }

    /* At least one string has ended; the one with data left is larger. */
    if (*s1 != 0) {
        return 1;
    }
    if (*s2 != 0) {
        return -1;
    }
    return 0;
}
15530
/* Compare at most `n` elements of two null-terminated strings.
   Returns -1 or +1 at the first differing element, and 0 when the first
   `n` elements match or both strings end together before that. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        const Py_UNICODE a = s1[i];
        const Py_UNICODE b = s2[i];

        if (a != b) {
            return (a < b) ? -1 : +1;
        }
        if (a == '\0') {
            /* Both strings ended here (a == b). */
            return 0;
        }
    }
    return 0;
}
15547
/* Return a pointer to the first occurrence of `c` in the null-terminated
   string `s`, or NULL if there is none.  The terminator itself is never
   matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c) {
            return (Py_UNICODE*)s;
        }
        s++;
    }
    return NULL;
}
15557
/* Return a pointer to the last occurrence of `c` in the null-terminated
   string `s`, or NULL if there is none.  The terminator itself is never
   matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t i = wcslen(s);

    /* Walk backwards from the last real character. */
    while (i > 0) {
        i--;
        if (s[i] == c) {
            return (Py_UNICODE*)(s + i);
        }
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015570
Victor Stinner71133ff2010-09-01 23:43:53 +000015571Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015572PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015573{
Victor Stinner577db2c2011-10-11 22:12:48 +020015574 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015575 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015577 if (!PyUnicode_Check(unicode)) {
15578 PyErr_BadArgument();
15579 return NULL;
15580 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015581 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015582 if (u == NULL)
15583 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015584 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015585 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015586 PyErr_NoMemory();
15587 return NULL;
15588 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015589 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015590 size *= sizeof(Py_UNICODE);
15591 copy = PyMem_Malloc(size);
15592 if (copy == NULL) {
15593 PyErr_NoMemory();
15594 return NULL;
15595 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015596 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015597 return copy;
15598}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015599
Georg Brandl66c221e2010-10-14 07:04:07 +000015600/* A _string module, to export formatter_parser and formatter_field_name_split
15601 to the string.Formatter class implemented in Python. */
15602
15603static PyMethodDef _string_methods[] = {
15604 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15605 METH_O, PyDoc_STR("split the argument as a field name")},
15606 {"formatter_parser", (PyCFunction) formatter_parser,
15607 METH_O, PyDoc_STR("parse the argument as a format string")},
15608 {NULL, NULL}
15609};
15610
15611static struct PyModuleDef _string_module = {
15612 PyModuleDef_HEAD_INIT,
15613 "_string",
15614 PyDoc_STR("string helper module"),
15615 0,
15616 _string_methods,
15617 NULL,
15618 NULL,
15619 NULL,
15620 NULL
15621};
15622
15623PyMODINIT_FUNC
15624PyInit__string(void)
15625{
15626 return PyModule_Create(&_string_module);
15627}
15628
15629
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015630#ifdef __cplusplus
15631}
15632#endif