blob: d22b277a51c8a77a9b37220b3b32329c13052562 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010043#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010044#include "pycore_object.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010045#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050047#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070048#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
Larry Hastings61272b72014-01-07 12:41:53 -080054/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090055class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080056[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090057/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
58
59/*[python input]
60class Py_UCS4_converter(CConverter):
61 type = 'Py_UCS4'
62 converter = 'convert_uc'
63
64 def converter_init(self):
65 if self.default is not unspecified:
66 self.c_default = ascii(self.default)
67 if len(self.c_default) > 4 or self.c_default[0] != "'":
68 self.c_default = hex(ord(self.default))
69
70[python start generated code]*/
71/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080072
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000073/* --- Globals ------------------------------------------------------------
74
Serhiy Storchaka05997252013-01-26 12:14:02 +020075NOTE: In the interpreter's initialization phase, some globals are currently
76 initialized dynamically as needed. In the process Unicode objects may
77 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000078
79*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000080
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000081
82#ifdef __cplusplus
83extern "C" {
84#endif
85
Victor Stinner8faf8212011-12-08 22:14:11 +010086/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
87#define MAX_UNICODE 0x10ffff
88
/* Sanity check used by the macros in this file.  Release builds only test
   the type; debug builds run the structural consistency checker without
   scanning the string contents (check_content == 0). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020094
/* Field accessors for the str object layouts.  The underscore-prefixed
   variants expand to plain lvalues and perform no checks unless noted. */

/* utf8 field of the compact header (unchecked lvalue). */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* Checked UTF-8 pointer of a ready string.  For a compact ASCII string this
   is the address immediately after the PyASCIIObject header; otherwise it
   is the utf8 field. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((char *)((PyASCIIObject *)(op) + 1))            \
         : _PyUnicode_UTF8(op))

/* utf8_length field of the compact header (unchecked lvalue). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Checked UTF-8 length of a ready string.  For a compact ASCII string this
   is the length field of the PyASCIIObject header; otherwise it is the
   utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((PyASCIIObject *)(op))->length                  \
         : _PyUnicode_UTF8_LENGTH(op))

/* wstr / wstr_length fields (unchecked lvalues). */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)

/* length / state / hash fields of the base header (unchecked lvalues). */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* state.kind and length, guarded by _PyUnicode_CHECK. */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)

/* data.any field of the PyUnicodeObject layout (unchecked lvalue). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200129
/* File-local replacement for PyUnicode_READY: validates the object with
   _PyUnicode_CHECK first, then evaluates to 0 when the string is already
   ready and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                         \
    (assert(_PyUnicode_CHECK(op)),                  \
     (!PyUnicode_IS_READY(op)                       \
          ? _PyUnicode_Ready(op)                    \
          : 0))
Victor Stinner910337b2011-10-03 03:20:16 +0200136
/* True when the utf8 pointer of a non-compact-ASCII string is the very same
   buffer as its character data (i.e. there is no separate UTF-8 copy). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True when the wstr pointer of the string is the very same buffer as its
   character data (i.e. there is no separate wchar_t copy).

   Both sides of the comparison must be taken from the macro argument `op`.
   Referring to any other identifier here would silently capture a variable
   from the expansion site, which fails to compile (or tests the wrong
   object) whenever the caller's variable is not named identically. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
144
/* Non-zero when the string owns a separately allocated UTF-8 buffer:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not simply the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                       \
      && _PyUnicode_UTF8(op)                                \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the string owns a separately allocated wstr buffer:
   its wstr pointer is set and either the string is not ready yet or the
   pointer is not simply the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                      \
    ((_PyUnicode_WSTR(op)                                   \
      && (!PyUnicode_IS_READY(op)                           \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
158
/* Copy a run of characters while converting the element type.

   from_type / to_type  element types of the source and destination
   begin / end          bounds of the source run ("from_type *")
   to                   destination buffer ("to_type *")

   Each element is converted with a plain cast.  The bulk of the run is
   handled four elements per iteration; a second loop copies the remaining
   tail.  The internal identifiers are part of the macro's footprint at the
   expansion site and are deliberately left unchanged. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (const from_type *)(begin);            \
        const from_type *_end = (const from_type *)(end);               \
        Py_ssize_t n = _end - _iter;                                    \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                          \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {           \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < _end; ++_iter, ++_to) {                          \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200182
/* Over-allocation is expressed as a divisor: a factor of 2 corresponds to
   50% extra space and a factor of 4 to 25%. */
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
190
Walter Dörwald16807132007-05-25 13:52:07 +0000191/* This dictionary holds all interned unicode strings. Note that references
192 to strings in this dictionary are *not* counted in the string's ob_refcnt.
193 When the interned string reaches a refcnt of 0 the string deallocation
194 function will delete the reference from this dictionary.
195
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000198*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200199static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200203
/* Take a new reference to the shared empty string, creating the singleton
   with PyUnicode_New(0, 0) on first use.  If that creation fails,
   unicode_empty is left NULL and no reference is taken, so callers must
   check unicode_empty afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Return a new reference to the shared empty string from the enclosing
   function.  The returned value is whatever unicode_empty holds after
   _Py_INCREF_UNICODE_EMPTY() has run. */
#define _Py_RETURN_UNICODE_EMPTY()              \
    do {                                        \
        _Py_INCREF_UNICODE_EMPTY();             \
        return unicode_empty;                   \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000222
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200223/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700224static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200225_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
226
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200227/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200228static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200229
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230/* Single character Unicode strings in the Latin-1 range are being
231 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200232static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000233
/* Fast detection of the most frequent whitespace characters.
   Indexed by a 7-bit character code; entries not listed are 0. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
264
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200265/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200266static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200267static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100268static int unicode_modifiable(PyObject *unicode);
269
Victor Stinnerfe226c02011-10-03 03:52:20 +0200270
Alexander Belopolsky40018472011-02-26 01:02:56 +0000271static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100272_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200273static PyObject *
274_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
275static PyObject *
276_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
277
278static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000279unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000280 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100281 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000282 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
283
Alexander Belopolsky40018472011-02-26 01:02:56 +0000284static void
285raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300286 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100287 PyObject *unicode,
288 Py_ssize_t startpos, Py_ssize_t endpos,
289 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000290
/* Same kind of lookup table for line breaks.
   Indexed by a 7-bit character code; entries not listed are 0. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
318
INADA Naoki3ae20562017-01-16 20:41:20 +0900319static int convert_uc(PyObject *obj, void *addr);
320
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300321#include "clinic/unicodeobject.c.h"
322
Victor Stinner3d4226a2018-08-29 22:21:32 +0200323_Py_error_handler
324_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200325{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200326 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200327 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200328 }
329 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200330 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200331 }
332 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200333 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200334 }
335 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200336 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200337 }
338 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200339 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200340 }
341 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200342 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200343 }
344 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200345 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200346 }
Victor Stinner50149202015-09-22 00:26:54 +0200347 return _Py_ERROR_OTHER;
348}
349
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300350/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
351 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000352Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000353PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000354{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000355#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000357#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000358 /* This is actually an illegal character, so it should
359 not be passed to unichr. */
360 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000361#endif
362}
363
#ifdef Py_DEBUG
/* Debug-build invariant checker for str objects.

   op:            object to validate; must be a str.
   check_content: when non-zero, additionally scan the characters to verify
                  that the narrowest possible kind is used and that the
                  buffer is terminated by a zero character.

   Every violated invariant is reported through _PyObject_ASSERT; when all
   checks pass the function returns 1 (so it can be wrapped in assert()). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
#define ASSERT(expr) _PyObject_ASSERT(op, (expr))

    PyASCIIObject *ascii;
    unsigned int kind;

    ASSERT(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* Compact ASCII: only the base header exists. */
        ASSERT(kind == PyUnicode_1BYTE_KIND);
        ASSERT(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* Kind whose code units have the same size as wchar_t. */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* Compact non-ASCII: characters follow the compact header. */
            data = compact + 1;
            ASSERT(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            ASSERT(ascii->state.ascii == 0);
            ASSERT(ascii->state.ready == 1);
            ASSERT (compact->utf8 != data);
        }
        else {
            /* Non-compact layout: characters live behind data.any. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr buffer is populated. */
                ASSERT(ascii->length == 0);
                ASSERT(ascii->hash == -1);
                ASSERT(ascii->state.compact == 0);
                ASSERT(ascii->state.ascii == 0);
                ASSERT(ascii->state.ready == 0);
                ASSERT(ascii->state.interned == SSTATE_NOT_INTERNED);
                ASSERT(ascii->wstr != NULL);
                ASSERT(data == NULL);
                ASSERT(compact->utf8 == NULL);
            }
            else {
                ASSERT(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                ASSERT(ascii->state.compact == 0);
                ASSERT(ascii->state.ready == 1);
                ASSERT(data != NULL);
                if (ascii->state.ascii) {
                    ASSERT (compact->utf8 == data);
                    ASSERT (compact->utf8_length == ascii->length);
                }
                else {
                    ASSERT (compact->utf8 != data);
                }
            }
        }

        /* wstr may alias the data only when the code unit sizes agree. */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                ASSERT(ascii->wstr == data);
                ASSERT(compact->wstr_length == ascii->length);
            }
            else {
                ASSERT(ascii->wstr != data);
            }
        }

        /* A missing cache must have a zero cached length. */
        if (compact->utf8 == NULL) {
            ASSERT(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            ASSERT(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (ch > maxchar) {
                maxchar = ch;
            }
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                ASSERT(maxchar >= 128);
                ASSERT(maxchar <= 255);
            }
            else {
                ASSERT(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            ASSERT(maxchar >= 0x100);
            ASSERT(maxchar <= 0xFFFF);
        }
        else {
            ASSERT(maxchar >= 0x10000);
            ASSERT(maxchar <= MAX_UNICODE);
        }
        /* The buffer must carry a trailing zero character. */
        ASSERT(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef ASSERT
}
#endif
483
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100484static PyObject*
485unicode_result_wchar(PyObject *unicode)
486{
487#ifndef Py_DEBUG
488 Py_ssize_t len;
489
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100490 len = _PyUnicode_WSTR_LENGTH(unicode);
491 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100492 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200493 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100494 }
495
496 if (len == 1) {
497 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100498 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100499 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
500 Py_DECREF(unicode);
501 return latin1_char;
502 }
503 }
504
505 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200506 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100507 return NULL;
508 }
509#else
Victor Stinneraa771272012-10-04 02:32:58 +0200510 assert(Py_REFCNT(unicode) == 1);
511
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100512 /* don't make the result ready in debug mode to ensure that the caller
513 makes the string ready before using it */
514 assert(_PyUnicode_CheckConsistency(unicode, 1));
515#endif
516 return unicode;
517}
518
519static PyObject*
520unicode_result_ready(PyObject *unicode)
521{
522 Py_ssize_t length;
523
524 length = PyUnicode_GET_LENGTH(unicode);
525 if (length == 0) {
526 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100527 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200528 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100529 }
530 return unicode_empty;
531 }
532
533 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200534 void *data = PyUnicode_DATA(unicode);
535 int kind = PyUnicode_KIND(unicode);
536 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100537 if (ch < 256) {
538 PyObject *latin1_char = unicode_latin1[ch];
539 if (latin1_char != NULL) {
540 if (unicode != latin1_char) {
541 Py_INCREF(latin1_char);
542 Py_DECREF(unicode);
543 }
544 return latin1_char;
545 }
546 else {
547 assert(_PyUnicode_CheckConsistency(unicode, 1));
548 Py_INCREF(unicode);
549 unicode_latin1[ch] = unicode;
550 return unicode;
551 }
552 }
553 }
554
555 assert(_PyUnicode_CheckConsistency(unicode, 1));
556 return unicode;
557}
558
559static PyObject*
560unicode_result(PyObject *unicode)
561{
562 assert(_PyUnicode_CHECK(unicode));
563 if (PyUnicode_IS_READY(unicode))
564 return unicode_result_ready(unicode);
565 else
566 return unicode_result_wchar(unicode);
567}
568
Victor Stinnerc4b49542011-12-11 22:44:26 +0100569static PyObject*
570unicode_result_unchanged(PyObject *unicode)
571{
572 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500573 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100574 return NULL;
575 Py_INCREF(unicode);
576 return unicode;
577 }
578 else
579 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100580 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100581}
582
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200583/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
584 ASCII, Latin1, UTF-8, etc. */
585static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200586backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200587 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
588{
Victor Stinnerad771582015-10-09 12:38:53 +0200589 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200590 Py_UCS4 ch;
591 enum PyUnicode_Kind kind;
592 void *data;
593
594 assert(PyUnicode_IS_READY(unicode));
595 kind = PyUnicode_KIND(unicode);
596 data = PyUnicode_DATA(unicode);
597
598 size = 0;
599 /* determine replacement size */
600 for (i = collstart; i < collend; ++i) {
601 Py_ssize_t incr;
602
603 ch = PyUnicode_READ(kind, data, i);
604 if (ch < 0x100)
605 incr = 2+2;
606 else if (ch < 0x10000)
607 incr = 2+4;
608 else {
609 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200610 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200611 }
612 if (size > PY_SSIZE_T_MAX - incr) {
613 PyErr_SetString(PyExc_OverflowError,
614 "encoded result is too long for a Python string");
615 return NULL;
616 }
617 size += incr;
618 }
619
Victor Stinnerad771582015-10-09 12:38:53 +0200620 str = _PyBytesWriter_Prepare(writer, str, size);
621 if (str == NULL)
622 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200623
624 /* generate replacement */
625 for (i = collstart; i < collend; ++i) {
626 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200627 *str++ = '\\';
628 if (ch >= 0x00010000) {
629 *str++ = 'U';
630 *str++ = Py_hexdigits[(ch>>28)&0xf];
631 *str++ = Py_hexdigits[(ch>>24)&0xf];
632 *str++ = Py_hexdigits[(ch>>20)&0xf];
633 *str++ = Py_hexdigits[(ch>>16)&0xf];
634 *str++ = Py_hexdigits[(ch>>12)&0xf];
635 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200636 }
Victor Stinner797485e2015-10-09 03:17:30 +0200637 else if (ch >= 0x100) {
638 *str++ = 'u';
639 *str++ = Py_hexdigits[(ch>>12)&0xf];
640 *str++ = Py_hexdigits[(ch>>8)&0xf];
641 }
642 else
643 *str++ = 'x';
644 *str++ = Py_hexdigits[(ch>>4)&0xf];
645 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 }
647 return str;
648}
649
650/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
651 ASCII, Latin1, UTF-8, etc. */
652static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200653xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200654 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
655{
Victor Stinnerad771582015-10-09 12:38:53 +0200656 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200657 Py_UCS4 ch;
658 enum PyUnicode_Kind kind;
659 void *data;
660
661 assert(PyUnicode_IS_READY(unicode));
662 kind = PyUnicode_KIND(unicode);
663 data = PyUnicode_DATA(unicode);
664
665 size = 0;
666 /* determine replacement size */
667 for (i = collstart; i < collend; ++i) {
668 Py_ssize_t incr;
669
670 ch = PyUnicode_READ(kind, data, i);
671 if (ch < 10)
672 incr = 2+1+1;
673 else if (ch < 100)
674 incr = 2+2+1;
675 else if (ch < 1000)
676 incr = 2+3+1;
677 else if (ch < 10000)
678 incr = 2+4+1;
679 else if (ch < 100000)
680 incr = 2+5+1;
681 else if (ch < 1000000)
682 incr = 2+6+1;
683 else {
684 assert(ch <= MAX_UNICODE);
685 incr = 2+7+1;
686 }
687 if (size > PY_SSIZE_T_MAX - incr) {
688 PyErr_SetString(PyExc_OverflowError,
689 "encoded result is too long for a Python string");
690 return NULL;
691 }
692 size += incr;
693 }
694
Victor Stinnerad771582015-10-09 12:38:53 +0200695 str = _PyBytesWriter_Prepare(writer, str, size);
696 if (str == NULL)
697 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200698
699 /* generate replacement */
700 for (i = collstart; i < collend; ++i) {
701 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
702 }
703 return str;
704}
705
Thomas Wouters477c8d52006-05-27 19:21:47 +0000706/* --- Bloom Filters ----------------------------------------------------- */
707
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits of
   each Unicode character (ch & (BLOOM_WIDTH - 1), i.e. the least 5, 6 or 7
   bits depending on BLOOM_WIDTH) as the bit index. */
711
712/* the linebreak mask is set up by Unicode_Init below */
713
Antoine Pitrouf068f942010-01-13 14:19:12 +0000714#if LONG_BIT >= 128
715#define BLOOM_WIDTH 128
716#elif LONG_BIT >= 64
717#define BLOOM_WIDTH 64
718#elif LONG_BIT >= 32
719#define BLOOM_WIDTH 32
720#else
721#error "LONG_BIT is smaller than 32"
722#endif
723
Thomas Wouters477c8d52006-05-27 19:21:47 +0000724#define BLOOM_MASK unsigned long
725
Serhiy Storchaka05997252013-01-26 12:14:02 +0200726static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000727
Antoine Pitrouf068f942010-01-13 14:19:12 +0000728#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000729
Benjamin Peterson29060642009-01-31 22:14:21 +0000730#define BLOOM_LINEBREAK(ch) \
731 ((ch) < 128U ? ascii_linebreak[(ch)] : \
732 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000733
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700734static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200735make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000736{
Victor Stinnera85af502013-04-09 21:53:54 +0200737#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
738 do { \
739 TYPE *data = (TYPE *)PTR; \
740 TYPE *end = data + LEN; \
741 Py_UCS4 ch; \
742 for (; data != end; data++) { \
743 ch = *data; \
744 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
745 } \
746 break; \
747 } while (0)
748
Thomas Wouters477c8d52006-05-27 19:21:47 +0000749 /* calculate simple bloom-style bitmask for a given unicode string */
750
Antoine Pitrouf068f942010-01-13 14:19:12 +0000751 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000752
753 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200754 switch (kind) {
755 case PyUnicode_1BYTE_KIND:
756 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
757 break;
758 case PyUnicode_2BYTE_KIND:
759 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
760 break;
761 case PyUnicode_4BYTE_KIND:
762 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
763 break;
764 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700765 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200766 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000767 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200768
769#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000770}
771
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300772static int
773ensure_unicode(PyObject *obj)
774{
775 if (!PyUnicode_Check(obj)) {
776 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200777 "must be str, not %.100s",
778 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300779 return -1;
780 }
781 return PyUnicode_READY(obj);
782}
783
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200784/* Compilation of templated routines */
785
786#include "stringlib/asciilib.h"
787#include "stringlib/fastsearch.h"
788#include "stringlib/partition.h"
789#include "stringlib/split.h"
790#include "stringlib/count.h"
791#include "stringlib/find.h"
792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs1lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs2lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
818#include "stringlib/ucs4lib.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/partition.h"
821#include "stringlib/split.h"
822#include "stringlib/count.h"
823#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300824#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200825#include "stringlib/find_max_char.h"
826#include "stringlib/localeutil.h"
827#include "stringlib/undef.h"
828
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200829#include "stringlib/unicodedefs.h"
830#include "stringlib/fastsearch.h"
831#include "stringlib/count.h"
832#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100833#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834
Guido van Rossumd57fd912000-03-10 22:53:23 +0000835/* --- Unicode Object ----------------------------------------------------- */
836
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700837static inline Py_ssize_t
838findchar(const void *s, int kind,
839 Py_ssize_t size, Py_UCS4 ch,
840 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200841{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200842 switch (kind) {
843 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200844 if ((Py_UCS1) ch != ch)
845 return -1;
846 if (direction > 0)
847 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
848 else
849 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200850 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200851 if ((Py_UCS2) ch != ch)
852 return -1;
853 if (direction > 0)
854 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
855 else
856 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200857 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200858 if (direction > 0)
859 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
860 else
861 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200862 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700863 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200864 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200865}
866
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters in the range [old_length, current length) are
   overwritten, with 0xff bytes; nothing is done when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the size of one character in bytes */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        Py_ssize_t added = new_length - old_length;

        memset(bytes + old_length * char_size, 0xff, added * char_size);
    }
}
#endif
885
Victor Stinnerfe226c02011-10-03 03:52:20 +0200886static PyObject*
887resize_compact(PyObject *unicode, Py_ssize_t length)
888{
889 Py_ssize_t char_size;
890 Py_ssize_t struct_size;
891 Py_ssize_t new_size;
892 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100893 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200894#ifdef Py_DEBUG
895 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
896#endif
897
Victor Stinner79891572012-05-03 13:43:07 +0200898 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200899 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100900 assert(PyUnicode_IS_COMPACT(unicode));
901
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200902 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100903 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200904 struct_size = sizeof(PyASCIIObject);
905 else
906 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200907 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200908
Victor Stinnerfe226c02011-10-03 03:52:20 +0200909 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
910 PyErr_NoMemory();
911 return NULL;
912 }
913 new_size = (struct_size + (length + 1) * char_size);
914
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200915 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
916 PyObject_DEL(_PyUnicode_UTF8(unicode));
917 _PyUnicode_UTF8(unicode) = NULL;
918 _PyUnicode_UTF8_LENGTH(unicode) = 0;
919 }
Victor Stinner84def372011-12-11 20:04:56 +0100920 _Py_DEC_REFTOTAL;
921 _Py_ForgetReference(unicode);
922
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300923 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100924 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100925 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200926 PyErr_NoMemory();
927 return NULL;
928 }
Victor Stinner84def372011-12-11 20:04:56 +0100929 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200930 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100931
Victor Stinnerfe226c02011-10-03 03:52:20 +0200932 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200933 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200934 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100935 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200936 _PyUnicode_WSTR_LENGTH(unicode) = length;
937 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100938 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
939 PyObject_DEL(_PyUnicode_WSTR(unicode));
940 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100941 if (!PyUnicode_IS_ASCII(unicode))
942 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100943 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200944#ifdef Py_DEBUG
945 unicode_fill_invalid(unicode, old_length);
946#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200947 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
948 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200949 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200950 return unicode;
951}
952
Alexander Belopolsky40018472011-02-26 01:02:56 +0000953static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200954resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000955{
Victor Stinner95663112011-10-04 01:03:50 +0200956 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100957 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200959 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000960
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961 if (PyUnicode_IS_READY(unicode)) {
962 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200963 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200964 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200965#ifdef Py_DEBUG
966 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
967#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968
969 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200970 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200971 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
972 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200973
974 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
975 PyErr_NoMemory();
976 return -1;
977 }
978 new_size = (length + 1) * char_size;
979
Victor Stinner7a9105a2011-12-12 00:13:42 +0100980 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
981 {
982 PyObject_DEL(_PyUnicode_UTF8(unicode));
983 _PyUnicode_UTF8(unicode) = NULL;
984 _PyUnicode_UTF8_LENGTH(unicode) = 0;
985 }
986
Victor Stinnerfe226c02011-10-03 03:52:20 +0200987 data = (PyObject *)PyObject_REALLOC(data, new_size);
988 if (data == NULL) {
989 PyErr_NoMemory();
990 return -1;
991 }
992 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200993 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200995 _PyUnicode_WSTR_LENGTH(unicode) = length;
996 }
997 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200998 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200999 _PyUnicode_UTF8_LENGTH(unicode) = length;
1000 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 _PyUnicode_LENGTH(unicode) = length;
1002 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001003#ifdef Py_DEBUG
1004 unicode_fill_invalid(unicode, old_length);
1005#endif
Victor Stinner95663112011-10-04 01:03:50 +02001006 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001007 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001008 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001009 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001010 }
Victor Stinner95663112011-10-04 01:03:50 +02001011 assert(_PyUnicode_WSTR(unicode) != NULL);
1012
1013 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001014 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001015 PyErr_NoMemory();
1016 return -1;
1017 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001018 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001019 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001020 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001021 if (!wstr) {
1022 PyErr_NoMemory();
1023 return -1;
1024 }
1025 _PyUnicode_WSTR(unicode) = wstr;
1026 _PyUnicode_WSTR(unicode)[length] = 0;
1027 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001028 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001029 return 0;
1030}
1031
Victor Stinnerfe226c02011-10-03 03:52:20 +02001032static PyObject*
1033resize_copy(PyObject *unicode, Py_ssize_t length)
1034{
1035 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001036 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001037 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001038
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001039 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040
1041 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1042 if (copy == NULL)
1043 return NULL;
1044
1045 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001046 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001048 }
1049 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001050 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001051
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001052 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 if (w == NULL)
1054 return NULL;
1055 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1056 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001057 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001058 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001059 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001060 }
1061}
1062
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001066
1067 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001068 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001069
1070*/
1071
Alexander Belopolsky40018472011-02-26 01:02:56 +00001072static PyUnicodeObject *
1073_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001075 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001076 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001077
Thomas Wouters477c8d52006-05-27 19:21:47 +00001078 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 if (length == 0 && unicode_empty != NULL) {
1080 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001081 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001082 }
1083
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001084 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001085 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001086 return (PyUnicodeObject *)PyErr_NoMemory();
1087 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 if (length < 0) {
1089 PyErr_SetString(PyExc_SystemError,
1090 "Negative size passed to _PyUnicode_New");
1091 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 }
1093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001094 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1095 if (unicode == NULL)
1096 return NULL;
1097 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001098
1099 _PyUnicode_WSTR_LENGTH(unicode) = length;
1100 _PyUnicode_HASH(unicode) = -1;
1101 _PyUnicode_STATE(unicode).interned = 0;
1102 _PyUnicode_STATE(unicode).kind = 0;
1103 _PyUnicode_STATE(unicode).compact = 0;
1104 _PyUnicode_STATE(unicode).ready = 0;
1105 _PyUnicode_STATE(unicode).ascii = 0;
1106 _PyUnicode_DATA_ANY(unicode) = NULL;
1107 _PyUnicode_LENGTH(unicode) = 0;
1108 _PyUnicode_UTF8(unicode) = NULL;
1109 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1112 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001113 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001114 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001115 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117
Jeremy Hyltond8082792003-09-16 19:41:39 +00001118 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001119 * the caller fails before initializing str -- unicode_resize()
1120 * reads str[0], and the Keep-Alive optimization can keep memory
1121 * allocated for str alive across a call to unicode_dealloc(unicode).
1122 * We don't want unicode_resize to read uninitialized memory in
1123 * that case.
1124 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 _PyUnicode_WSTR(unicode)[0] = 0;
1126 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001127
Victor Stinner7931d9a2011-11-04 00:22:48 +01001128 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001129 return unicode;
1130}
1131
Victor Stinnerf42dc442011-10-02 23:33:16 +02001132static const char*
1133unicode_kind_name(PyObject *unicode)
1134{
Victor Stinner42dfd712011-10-03 14:41:45 +02001135 /* don't check consistency: unicode_kind_name() is called from
1136 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001137 if (!PyUnicode_IS_COMPACT(unicode))
1138 {
1139 if (!PyUnicode_IS_READY(unicode))
1140 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001141 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001142 {
1143 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001144 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001145 return "legacy ascii";
1146 else
1147 return "legacy latin1";
1148 case PyUnicode_2BYTE_KIND:
1149 return "legacy UCS2";
1150 case PyUnicode_4BYTE_KIND:
1151 return "legacy UCS4";
1152 default:
1153 return "<legacy invalid kind>";
1154 }
1155 }
1156 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001157 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001158 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001159 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001160 return "ascii";
1161 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001162 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001163 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001164 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001165 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001166 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167 default:
1168 return "<invalid compact kind>";
1169 }
1170}
1171
#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Return PyUnicode_UTF8(unicode). */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Return _PyUnicode_COMPACT_DATA(unicode). */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact/ascii flags and the candidate data
   addresses to stdout, then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of the string object op to stdout: its kind
   name, length, wstr pointer, and -- unless the string is compact ASCII --
   the wstr length and the UTF-8 pointer and length, followed by the address
   of the character data.  "shared " is printed in front of a pointer that
   aliases the character data.

   Pointers printed with %p are cast to void*, which is the argument type
   that conversion requires. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* locate the character data without going through the checking macros */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
           unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);

    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
               (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
1225
1226PyObject *
1227PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1228{
1229 PyObject *obj;
1230 PyCompactUnicodeObject *unicode;
1231 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001232 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001233 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234 Py_ssize_t char_size;
1235 Py_ssize_t struct_size;
1236
1237 /* Optimization for empty strings */
1238 if (size == 0 && unicode_empty != NULL) {
1239 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001240 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 }
1242
Victor Stinner9e9d6892011-10-04 01:02:02 +02001243 is_ascii = 0;
1244 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001245 struct_size = sizeof(PyCompactUnicodeObject);
1246 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001247 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248 char_size = 1;
1249 is_ascii = 1;
1250 struct_size = sizeof(PyASCIIObject);
1251 }
1252 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001253 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254 char_size = 1;
1255 }
1256 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001257 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258 char_size = 2;
1259 if (sizeof(wchar_t) == 2)
1260 is_sharing = 1;
1261 }
1262 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001263 if (maxchar > MAX_UNICODE) {
1264 PyErr_SetString(PyExc_SystemError,
1265 "invalid maximum character passed to PyUnicode_New");
1266 return NULL;
1267 }
Victor Stinner8f825062012-04-27 13:55:39 +02001268 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001269 char_size = 4;
1270 if (sizeof(wchar_t) == 4)
1271 is_sharing = 1;
1272 }
1273
1274 /* Ensure we won't overflow the size. */
1275 if (size < 0) {
1276 PyErr_SetString(PyExc_SystemError,
1277 "Negative size passed to PyUnicode_New");
1278 return NULL;
1279 }
1280 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1281 return PyErr_NoMemory();
1282
1283 /* Duplicated allocation code from _PyObject_New() instead of a call to
1284 * PyObject_New() so we are able to allocate space for the object and
1285 * it's data buffer.
1286 */
1287 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1288 if (obj == NULL)
1289 return PyErr_NoMemory();
1290 obj = PyObject_INIT(obj, &PyUnicode_Type);
1291 if (obj == NULL)
1292 return NULL;
1293
1294 unicode = (PyCompactUnicodeObject *)obj;
1295 if (is_ascii)
1296 data = ((PyASCIIObject*)obj) + 1;
1297 else
1298 data = unicode + 1;
1299 _PyUnicode_LENGTH(unicode) = size;
1300 _PyUnicode_HASH(unicode) = -1;
1301 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001302 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001303 _PyUnicode_STATE(unicode).compact = 1;
1304 _PyUnicode_STATE(unicode).ready = 1;
1305 _PyUnicode_STATE(unicode).ascii = is_ascii;
1306 if (is_ascii) {
1307 ((char*)data)[size] = 0;
1308 _PyUnicode_WSTR(unicode) = NULL;
1309 }
Victor Stinner8f825062012-04-27 13:55:39 +02001310 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 ((char*)data)[size] = 0;
1312 _PyUnicode_WSTR(unicode) = NULL;
1313 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001315 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001316 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 else {
1318 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001319 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001320 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001322 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 ((Py_UCS4*)data)[size] = 0;
1324 if (is_sharing) {
1325 _PyUnicode_WSTR_LENGTH(unicode) = size;
1326 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1327 }
1328 else {
1329 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1330 _PyUnicode_WSTR(unicode) = NULL;
1331 }
1332 }
Victor Stinner8f825062012-04-27 13:55:39 +02001333#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001334 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001335#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001336 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 return obj;
1338}
1339
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   A high surrogate immediately followed by a low surrogate is joined into a
   single code point; every other element, including a lone surrogate, is
   copied unchanged.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* surrogate pair: two input elements, one code point */
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1379
Victor Stinnercd9950f2011-10-02 00:34:53 +02001380static int
Victor Stinner488fa492011-12-12 00:01:39 +01001381unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001382{
Victor Stinner488fa492011-12-12 00:01:39 +01001383 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001384 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001385 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001386 return -1;
1387 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001388 return 0;
1389}
1390
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001391static int
1392_copy_characters(PyObject *to, Py_ssize_t to_start,
1393 PyObject *from, Py_ssize_t from_start,
1394 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001396 unsigned int from_kind, to_kind;
1397 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398
Victor Stinneree4544c2012-05-09 22:24:08 +02001399 assert(0 <= how_many);
1400 assert(0 <= from_start);
1401 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001402 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001403 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001404 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405
Victor Stinnerd3f08822012-05-29 12:57:52 +02001406 assert(PyUnicode_Check(to));
1407 assert(PyUnicode_IS_READY(to));
1408 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1409
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001410 if (how_many == 0)
1411 return 0;
1412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001414 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001416 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417
Victor Stinnerf1852262012-06-16 16:38:26 +02001418#ifdef Py_DEBUG
1419 if (!check_maxchar
1420 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1421 {
1422 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1423 Py_UCS4 ch;
1424 Py_ssize_t i;
1425 for (i=0; i < how_many; i++) {
1426 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1427 assert(ch <= to_maxchar);
1428 }
1429 }
1430#endif
1431
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001432 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001433 if (check_maxchar
1434 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1435 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001436 /* Writing Latin-1 characters into an ASCII string requires to
1437 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001438 Py_UCS4 max_char;
1439 max_char = ucs1lib_find_max_char(from_data,
1440 (Py_UCS1*)from_data + how_many);
1441 if (max_char >= 128)
1442 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001443 }
Christian Heimesf051e432016-09-13 20:22:02 +02001444 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001445 (char*)from_data + from_kind * from_start,
1446 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001448 else if (from_kind == PyUnicode_1BYTE_KIND
1449 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001450 {
1451 _PyUnicode_CONVERT_BYTES(
1452 Py_UCS1, Py_UCS2,
1453 PyUnicode_1BYTE_DATA(from) + from_start,
1454 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1455 PyUnicode_2BYTE_DATA(to) + to_start
1456 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001457 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001458 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001459 && to_kind == PyUnicode_4BYTE_KIND)
1460 {
1461 _PyUnicode_CONVERT_BYTES(
1462 Py_UCS1, Py_UCS4,
1463 PyUnicode_1BYTE_DATA(from) + from_start,
1464 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1465 PyUnicode_4BYTE_DATA(to) + to_start
1466 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001467 }
1468 else if (from_kind == PyUnicode_2BYTE_KIND
1469 && to_kind == PyUnicode_4BYTE_KIND)
1470 {
1471 _PyUnicode_CONVERT_BYTES(
1472 Py_UCS2, Py_UCS4,
1473 PyUnicode_2BYTE_DATA(from) + from_start,
1474 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1475 PyUnicode_4BYTE_DATA(to) + to_start
1476 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001477 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001478 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001479 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1480
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001481 if (!check_maxchar) {
1482 if (from_kind == PyUnicode_2BYTE_KIND
1483 && to_kind == PyUnicode_1BYTE_KIND)
1484 {
1485 _PyUnicode_CONVERT_BYTES(
1486 Py_UCS2, Py_UCS1,
1487 PyUnicode_2BYTE_DATA(from) + from_start,
1488 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1489 PyUnicode_1BYTE_DATA(to) + to_start
1490 );
1491 }
1492 else if (from_kind == PyUnicode_4BYTE_KIND
1493 && to_kind == PyUnicode_1BYTE_KIND)
1494 {
1495 _PyUnicode_CONVERT_BYTES(
1496 Py_UCS4, Py_UCS1,
1497 PyUnicode_4BYTE_DATA(from) + from_start,
1498 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1499 PyUnicode_1BYTE_DATA(to) + to_start
1500 );
1501 }
1502 else if (from_kind == PyUnicode_4BYTE_KIND
1503 && to_kind == PyUnicode_2BYTE_KIND)
1504 {
1505 _PyUnicode_CONVERT_BYTES(
1506 Py_UCS4, Py_UCS2,
1507 PyUnicode_4BYTE_DATA(from) + from_start,
1508 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1509 PyUnicode_2BYTE_DATA(to) + to_start
1510 );
1511 }
1512 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001513 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001514 }
1515 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001516 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001517 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001518 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001519 Py_ssize_t i;
1520
Victor Stinnera0702ab2011-09-29 14:14:38 +02001521 for (i=0; i < how_many; i++) {
1522 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001523 if (ch > to_maxchar)
1524 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001525 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1526 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001527 }
1528 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001529 return 0;
1530}
1531
Victor Stinnerd3f08822012-05-29 12:57:52 +02001532void
1533_PyUnicode_FastCopyCharacters(
1534 PyObject *to, Py_ssize_t to_start,
1535 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001536{
1537 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1538}
1539
1540Py_ssize_t
1541PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1542 PyObject *from, Py_ssize_t from_start,
1543 Py_ssize_t how_many)
1544{
1545 int err;
1546
1547 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1548 PyErr_BadInternalCall();
1549 return -1;
1550 }
1551
Benjamin Petersonbac79492012-01-14 13:34:47 -05001552 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001553 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001554 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001555 return -1;
1556
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001557 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001558 PyErr_SetString(PyExc_IndexError, "string index out of range");
1559 return -1;
1560 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001561 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001562 PyErr_SetString(PyExc_IndexError, "string index out of range");
1563 return -1;
1564 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001565 if (how_many < 0) {
1566 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1567 return -1;
1568 }
1569 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001570 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1571 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001572 "Cannot write %zi characters at %zi "
1573 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001574 how_many, to_start, PyUnicode_GET_LENGTH(to));
1575 return -1;
1576 }
1577
1578 if (how_many == 0)
1579 return 0;
1580
Victor Stinner488fa492011-12-12 00:01:39 +01001581 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001582 return -1;
1583
1584 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1585 if (err) {
1586 PyErr_Format(PyExc_SystemError,
1587 "Cannot copy %s characters "
1588 "into a string of %s characters",
1589 unicode_kind_name(from),
1590 unicode_kind_name(to));
1591 return -1;
1592 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001593 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594}
1595
Victor Stinner17222162011-09-28 22:15:37 +02001596/* Find the maximum code point and count the number of surrogate pairs so a
1597 correct string length can be computed before converting a string to UCS4.
1598 This function counts single surrogates as a character and not as a pair.
1599
1600 Return 0 on success, or -1 on error. */
1601static int
1602find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1603 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604{
1605 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001606 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001607
Victor Stinnerc53be962011-10-02 21:33:54 +02001608 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001609 *num_surrogates = 0;
1610 *maxchar = 0;
1611
1612 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001613#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001614 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1615 && (iter+1) < end
1616 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1617 {
1618 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1619 ++(*num_surrogates);
1620 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001621 }
1622 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001623#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001624 {
1625 ch = *iter;
1626 iter++;
1627 }
1628 if (ch > *maxchar) {
1629 *maxchar = ch;
1630 if (*maxchar > MAX_UNICODE) {
1631 PyErr_Format(PyExc_ValueError,
1632 "character U+%x is not in range [U+0000; U+10ffff]",
1633 ch);
1634 return -1;
1635 }
1636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001637 }
1638 return 0;
1639}
1640
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001641int
1642_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001643{
1644 wchar_t *end;
1645 Py_UCS4 maxchar = 0;
1646 Py_ssize_t num_surrogates;
1647#if SIZEOF_WCHAR_T == 2
1648 Py_ssize_t length_wo_surrogates;
1649#endif
1650
Georg Brandl7597add2011-10-05 16:36:47 +02001651 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001652 strings were created using _PyObject_New() and where no canonical
1653 representation (the str field) has been set yet aka strings
1654 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001655 assert(_PyUnicode_CHECK(unicode));
1656 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001658 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001659 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001660 /* Actually, it should neither be interned nor be anything else: */
1661 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001663 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001664 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001665 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667
1668 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001669 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1670 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671 PyErr_NoMemory();
1672 return -1;
1673 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001674 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 _PyUnicode_WSTR(unicode), end,
1676 PyUnicode_1BYTE_DATA(unicode));
1677 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1678 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1679 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1680 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001681 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001682 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001683 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 }
1685 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001686 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001687 _PyUnicode_UTF8(unicode) = NULL;
1688 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001689 }
1690 PyObject_FREE(_PyUnicode_WSTR(unicode));
1691 _PyUnicode_WSTR(unicode) = NULL;
1692 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1693 }
1694 /* In this case we might have to convert down from 4-byte native
1695 wchar_t to 2-byte unicode. */
1696 else if (maxchar < 65536) {
1697 assert(num_surrogates == 0 &&
1698 "FindMaxCharAndNumSurrogatePairs() messed up");
1699
Victor Stinner506f5922011-09-28 22:34:18 +02001700#if SIZEOF_WCHAR_T == 2
1701 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001702 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001703 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1704 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1705 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001706 _PyUnicode_UTF8(unicode) = NULL;
1707 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001708#else
1709 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001710 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001711 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001712 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001713 PyErr_NoMemory();
1714 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001715 }
Victor Stinner506f5922011-09-28 22:34:18 +02001716 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1717 _PyUnicode_WSTR(unicode), end,
1718 PyUnicode_2BYTE_DATA(unicode));
1719 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1720 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1721 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001722 _PyUnicode_UTF8(unicode) = NULL;
1723 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001724 PyObject_FREE(_PyUnicode_WSTR(unicode));
1725 _PyUnicode_WSTR(unicode) = NULL;
1726 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1727#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001728 }
1729 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1730 else {
1731#if SIZEOF_WCHAR_T == 2
1732 /* in case the native representation is 2-bytes, we need to allocate a
1733 new normalized 4-byte version. */
1734 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001735 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1736 PyErr_NoMemory();
1737 return -1;
1738 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001739 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1740 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 PyErr_NoMemory();
1742 return -1;
1743 }
1744 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1745 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001746 _PyUnicode_UTF8(unicode) = NULL;
1747 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001748 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1749 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001750 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 PyObject_FREE(_PyUnicode_WSTR(unicode));
1752 _PyUnicode_WSTR(unicode) = NULL;
1753 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1754#else
1755 assert(num_surrogates == 0);
1756
Victor Stinnerc3c74152011-10-02 20:39:55 +02001757 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001759 _PyUnicode_UTF8(unicode) = NULL;
1760 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1762#endif
1763 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1764 }
1765 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001766 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 return 0;
1768}
1769
Alexander Belopolsky40018472011-02-26 01:02:56 +00001770static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001771unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772{
Walter Dörwald16807132007-05-25 13:52:07 +00001773 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001774 case SSTATE_NOT_INTERNED:
1775 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001776
Benjamin Peterson29060642009-01-31 22:14:21 +00001777 case SSTATE_INTERNED_MORTAL:
1778 /* revive dead object temporarily for DelItem */
1779 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001780 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001781 Py_FatalError(
1782 "deletion of interned string failed");
1783 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001784
Benjamin Peterson29060642009-01-31 22:14:21 +00001785 case SSTATE_INTERNED_IMMORTAL:
1786 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001787 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001788
Benjamin Peterson29060642009-01-31 22:14:21 +00001789 default:
1790 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001791 }
1792
Victor Stinner03490912011-10-03 23:45:12 +02001793 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001795 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001796 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001797 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1798 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001800 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801}
1802
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or a cached one-character Latin-1 string from
   unicode_latin1[]), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    const PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty) {
        return 1;
    }

    /* Only a ready string of length 1 can be a cached Latin-1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1) {
        return 0;
    }

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1819
Alexander Belopolsky40018472011-02-26 01:02:56 +00001820static int
Victor Stinner488fa492011-12-12 00:01:39 +01001821unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001822{
Victor Stinner488fa492011-12-12 00:01:39 +01001823 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001824 if (Py_REFCNT(unicode) != 1)
1825 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001826 if (_PyUnicode_HASH(unicode) != -1)
1827 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001828 if (PyUnicode_CHECK_INTERNED(unicode))
1829 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001830 if (!PyUnicode_CheckExact(unicode))
1831 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001832#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001833 /* singleton refcount is greater than 1 */
1834 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001835#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001836 return 1;
1837}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001838
Victor Stinnerfe226c02011-10-03 03:52:20 +02001839static int
1840unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1841{
1842 PyObject *unicode;
1843 Py_ssize_t old_length;
1844
1845 assert(p_unicode != NULL);
1846 unicode = *p_unicode;
1847
1848 assert(unicode != NULL);
1849 assert(PyUnicode_Check(unicode));
1850 assert(0 <= length);
1851
Victor Stinner910337b2011-10-03 03:20:16 +02001852 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001853 old_length = PyUnicode_WSTR_LENGTH(unicode);
1854 else
1855 old_length = PyUnicode_GET_LENGTH(unicode);
1856 if (old_length == length)
1857 return 0;
1858
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001859 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001860 _Py_INCREF_UNICODE_EMPTY();
1861 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001862 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001863 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001864 return 0;
1865 }
1866
Victor Stinner488fa492011-12-12 00:01:39 +01001867 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001868 PyObject *copy = resize_copy(unicode, length);
1869 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001870 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001871 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001872 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001873 }
1874
Victor Stinnerfe226c02011-10-03 03:52:20 +02001875 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001876 PyObject *new_unicode = resize_compact(unicode, length);
1877 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001878 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001879 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001880 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001881 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001882 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001883}
1884
Alexander Belopolsky40018472011-02-26 01:02:56 +00001885int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001886PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001887{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001888 PyObject *unicode;
1889 if (p_unicode == NULL) {
1890 PyErr_BadInternalCall();
1891 return -1;
1892 }
1893 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001894 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001895 {
1896 PyErr_BadInternalCall();
1897 return -1;
1898 }
1899 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001900}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001901
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001902/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001903
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001904 WARNING: The function doesn't copy the terminating null character and
1905 doesn't check the maximum character (may write a latin1 character in an
1906 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001907static void
1908unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1909 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001910{
1911 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1912 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001913 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001914
1915 switch (kind) {
1916 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001917 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001918#ifdef Py_DEBUG
1919 if (PyUnicode_IS_ASCII(unicode)) {
1920 Py_UCS4 maxchar = ucs1lib_find_max_char(
1921 (const Py_UCS1*)str,
1922 (const Py_UCS1*)str + len);
1923 assert(maxchar < 128);
1924 }
1925#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001926 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001927 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001928 }
1929 case PyUnicode_2BYTE_KIND: {
1930 Py_UCS2 *start = (Py_UCS2 *)data + index;
1931 Py_UCS2 *ucs2 = start;
1932 assert(index <= PyUnicode_GET_LENGTH(unicode));
1933
Victor Stinner184252a2012-06-16 02:57:41 +02001934 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001935 *ucs2 = (Py_UCS2)*str;
1936
1937 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001938 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001939 }
1940 default: {
1941 Py_UCS4 *start = (Py_UCS4 *)data + index;
1942 Py_UCS4 *ucs4 = start;
1943 assert(kind == PyUnicode_4BYTE_KIND);
1944 assert(index <= PyUnicode_GET_LENGTH(unicode));
1945
Victor Stinner184252a2012-06-16 02:57:41 +02001946 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001947 *ucs4 = (Py_UCS4)*str;
1948
1949 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001950 }
1951 }
1952}
1953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954static PyObject*
1955get_latin1_char(unsigned char ch)
1956{
Victor Stinnera464fc12011-10-02 20:39:30 +02001957 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001959 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001960 if (!unicode)
1961 return NULL;
1962 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001963 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964 unicode_latin1[ch] = unicode;
1965 }
1966 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001967 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968}
1969
Victor Stinner985a82a2014-01-03 12:53:47 +01001970static PyObject*
1971unicode_char(Py_UCS4 ch)
1972{
1973 PyObject *unicode;
1974
1975 assert(ch <= MAX_UNICODE);
1976
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001977 if (ch < 256)
1978 return get_latin1_char(ch);
1979
Victor Stinner985a82a2014-01-03 12:53:47 +01001980 unicode = PyUnicode_New(1, ch);
1981 if (unicode == NULL)
1982 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001983
1984 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1985 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001986 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001987 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001988 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1989 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1990 }
1991 assert(_PyUnicode_CheckConsistency(unicode, 1));
1992 return unicode;
1993}
1994
Alexander Belopolsky40018472011-02-26 01:02:56 +00001995PyObject *
1996PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001998 if (u == NULL)
1999 return (PyObject*)_PyUnicode_New(size);
2000
2001 if (size < 0) {
2002 PyErr_BadInternalCall();
2003 return NULL;
2004 }
2005
2006 return PyUnicode_FromWideChar(u, size);
2007}
2008
2009PyObject *
2010PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2011{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002012 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 Py_UCS4 maxchar = 0;
2014 Py_ssize_t num_surrogates;
2015
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002016 if (u == NULL && size != 0) {
2017 PyErr_BadInternalCall();
2018 return NULL;
2019 }
2020
2021 if (size == -1) {
2022 size = wcslen(u);
2023 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002025 /* If the Unicode data is known at construction time, we can apply
2026 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002029 if (size == 0)
2030 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 /* Single character Unicode objects in the Latin-1 range are
2033 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002034 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 return get_latin1_char((unsigned char)*u);
2036
2037 /* If not empty and not single character, copy the Unicode data
2038 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002039 if (find_maxchar_surrogates(u, u + size,
2040 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 return NULL;
2042
Victor Stinner8faf8212011-12-08 22:14:11 +01002043 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 if (!unicode)
2045 return NULL;
2046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 switch (PyUnicode_KIND(unicode)) {
2048 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002049 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2051 break;
2052 case PyUnicode_2BYTE_KIND:
2053#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002054 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002056 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2058#endif
2059 break;
2060 case PyUnicode_4BYTE_KIND:
2061#if SIZEOF_WCHAR_T == 2
2062 /* This is the only case which has to process surrogates, thus
2063 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002064 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002065#else
2066 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002067 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002068#endif
2069 break;
2070 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002071 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002072 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002074 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075}
2076
Alexander Belopolsky40018472011-02-26 01:02:56 +00002077PyObject *
2078PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002079{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002080 if (size < 0) {
2081 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002082 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002083 return NULL;
2084 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002085 if (u != NULL)
2086 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2087 else
2088 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002089}
2090
Alexander Belopolsky40018472011-02-26 01:02:56 +00002091PyObject *
2092PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002093{
2094 size_t size = strlen(u);
2095 if (size > PY_SSIZE_T_MAX) {
2096 PyErr_SetString(PyExc_OverflowError, "input too long");
2097 return NULL;
2098 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002099 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002100}
2101
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002102PyObject *
2103_PyUnicode_FromId(_Py_Identifier *id)
2104{
2105 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002106 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2107 strlen(id->string),
2108 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002109 if (!id->object)
2110 return NULL;
2111 PyUnicode_InternInPlace(&id->object);
2112 assert(!id->next);
2113 id->next = static_strings;
2114 static_strings = id;
2115 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002116 return id->object;
2117}
2118
2119void
2120_PyUnicode_ClearStaticStrings()
2121{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002122 _Py_Identifier *tmp, *s = static_strings;
2123 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002124 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002125 tmp = s->next;
2126 s->next = NULL;
2127 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002128 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002129 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002130}
2131
Benjamin Peterson0df54292012-03-26 14:50:32 -04002132/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002133
Victor Stinnerd3f08822012-05-29 12:57:52 +02002134PyObject*
2135_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002136{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002137 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002138 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002139 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002140#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002141 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002142#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002143 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002144 }
Victor Stinner785938e2011-12-11 20:09:03 +01002145 unicode = PyUnicode_New(size, 127);
2146 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002147 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002148 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2149 assert(_PyUnicode_CheckConsistency(unicode, 1));
2150 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002151}
2152
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002153static Py_UCS4
2154kind_maxchar_limit(unsigned int kind)
2155{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002156 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002157 case PyUnicode_1BYTE_KIND:
2158 return 0x80;
2159 case PyUnicode_2BYTE_KIND:
2160 return 0x100;
2161 case PyUnicode_4BYTE_KIND:
2162 return 0x10000;
2163 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002164 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002165 }
2166}
2167
Victor Stinner702c7342011-10-05 13:50:52 +02002168static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002169_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002170{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002172 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002173
Serhiy Storchaka678db842013-01-26 12:16:36 +02002174 if (size == 0)
2175 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002176 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002177 if (size == 1)
2178 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002179
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002180 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002181 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182 if (!res)
2183 return NULL;
2184 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002185 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002187}
2188
Victor Stinnere57b1c02011-09-28 22:20:48 +02002189static PyObject*
2190_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002191{
2192 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002193 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002194
Serhiy Storchaka678db842013-01-26 12:16:36 +02002195 if (size == 0)
2196 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002197 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002198 if (size == 1)
2199 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002200
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002201 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002202 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 if (!res)
2204 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002205 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002207 else {
2208 _PyUnicode_CONVERT_BYTES(
2209 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2210 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002211 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 return res;
2213}
2214
Victor Stinnere57b1c02011-09-28 22:20:48 +02002215static PyObject*
2216_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217{
2218 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002219 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002220
Serhiy Storchaka678db842013-01-26 12:16:36 +02002221 if (size == 0)
2222 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002223 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002224 if (size == 1)
2225 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002226
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002227 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002228 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 if (!res)
2230 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002231 if (max_char < 256)
2232 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2233 PyUnicode_1BYTE_DATA(res));
2234 else if (max_char < 0x10000)
2235 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2236 PyUnicode_2BYTE_DATA(res));
2237 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002239 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 return res;
2241}
2242
2243PyObject*
2244PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2245{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002246 if (size < 0) {
2247 PyErr_SetString(PyExc_ValueError, "size must be positive");
2248 return NULL;
2249 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002250 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002252 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002253 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002254 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002255 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002256 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002257 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002258 PyErr_SetString(PyExc_SystemError, "invalid kind");
2259 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261}
2262
Victor Stinnerece58de2012-04-23 23:36:38 +02002263Py_UCS4
2264_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2265{
2266 enum PyUnicode_Kind kind;
2267 void *startptr, *endptr;
2268
2269 assert(PyUnicode_IS_READY(unicode));
2270 assert(0 <= start);
2271 assert(end <= PyUnicode_GET_LENGTH(unicode));
2272 assert(start <= end);
2273
2274 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2275 return PyUnicode_MAX_CHAR_VALUE(unicode);
2276
2277 if (start == end)
2278 return 127;
2279
Victor Stinner94d558b2012-04-27 22:26:58 +02002280 if (PyUnicode_IS_ASCII(unicode))
2281 return 127;
2282
Victor Stinnerece58de2012-04-23 23:36:38 +02002283 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002284 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002285 endptr = (char *)startptr + end * kind;
2286 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002287 switch(kind) {
2288 case PyUnicode_1BYTE_KIND:
2289 return ucs1lib_find_max_char(startptr, endptr);
2290 case PyUnicode_2BYTE_KIND:
2291 return ucs2lib_find_max_char(startptr, endptr);
2292 case PyUnicode_4BYTE_KIND:
2293 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002294 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002295 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002296 }
2297}
2298
Victor Stinner25a4b292011-10-06 12:31:55 +02002299/* Ensure that a string uses the most efficient storage, if it is not the
2300 case: create a new string with of the right kind. Write NULL into *p_unicode
2301 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002302static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002303unicode_adjust_maxchar(PyObject **p_unicode)
2304{
2305 PyObject *unicode, *copy;
2306 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002307 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002308 unsigned int kind;
2309
2310 assert(p_unicode != NULL);
2311 unicode = *p_unicode;
2312 assert(PyUnicode_IS_READY(unicode));
2313 if (PyUnicode_IS_ASCII(unicode))
2314 return;
2315
2316 len = PyUnicode_GET_LENGTH(unicode);
2317 kind = PyUnicode_KIND(unicode);
2318 if (kind == PyUnicode_1BYTE_KIND) {
2319 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002320 max_char = ucs1lib_find_max_char(u, u + len);
2321 if (max_char >= 128)
2322 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002323 }
2324 else if (kind == PyUnicode_2BYTE_KIND) {
2325 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002326 max_char = ucs2lib_find_max_char(u, u + len);
2327 if (max_char >= 256)
2328 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002329 }
2330 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002331 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002332 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002333 max_char = ucs4lib_find_max_char(u, u + len);
2334 if (max_char >= 0x10000)
2335 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002336 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002337 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002338 if (copy != NULL)
2339 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002340 Py_DECREF(unicode);
2341 *p_unicode = copy;
2342}
2343
Victor Stinner034f6cf2011-09-30 02:26:44 +02002344PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002345_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002346{
Victor Stinner87af4f22011-11-21 23:03:47 +01002347 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002348 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002349
Victor Stinner034f6cf2011-09-30 02:26:44 +02002350 if (!PyUnicode_Check(unicode)) {
2351 PyErr_BadInternalCall();
2352 return NULL;
2353 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002354 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002355 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002356
Victor Stinner87af4f22011-11-21 23:03:47 +01002357 length = PyUnicode_GET_LENGTH(unicode);
2358 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002359 if (!copy)
2360 return NULL;
2361 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2362
Christian Heimesf051e432016-09-13 20:22:02 +02002363 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002364 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002365 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002366 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002367}
2368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369
Victor Stinnerbc603d12011-10-02 01:00:40 +02002370/* Widen Unicode objects to larger buffers. Don't write terminating null
2371 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002372
2373void*
2374_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2375{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002376 Py_ssize_t len;
2377 void *result;
2378 unsigned int skind;
2379
Benjamin Petersonbac79492012-01-14 13:34:47 -05002380 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002381 return NULL;
2382
2383 len = PyUnicode_GET_LENGTH(s);
2384 skind = PyUnicode_KIND(s);
2385 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002386 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 return NULL;
2388 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002389 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002390 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002391 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002392 if (!result)
2393 return PyErr_NoMemory();
2394 assert(skind == PyUnicode_1BYTE_KIND);
2395 _PyUnicode_CONVERT_BYTES(
2396 Py_UCS1, Py_UCS2,
2397 PyUnicode_1BYTE_DATA(s),
2398 PyUnicode_1BYTE_DATA(s) + len,
2399 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002401 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002402 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002403 if (!result)
2404 return PyErr_NoMemory();
2405 if (skind == PyUnicode_2BYTE_KIND) {
2406 _PyUnicode_CONVERT_BYTES(
2407 Py_UCS2, Py_UCS4,
2408 PyUnicode_2BYTE_DATA(s),
2409 PyUnicode_2BYTE_DATA(s) + len,
2410 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002412 else {
2413 assert(skind == PyUnicode_1BYTE_KIND);
2414 _PyUnicode_CONVERT_BYTES(
2415 Py_UCS1, Py_UCS4,
2416 PyUnicode_1BYTE_DATA(s),
2417 PyUnicode_1BYTE_DATA(s) + len,
2418 result);
2419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002420 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002421 default:
2422 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002423 }
Victor Stinner01698042011-10-04 00:04:26 +02002424 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002425 return NULL;
2426}
2427
2428static Py_UCS4*
2429as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2430 int copy_null)
2431{
2432 int kind;
2433 void *data;
2434 Py_ssize_t len, targetlen;
2435 if (PyUnicode_READY(string) == -1)
2436 return NULL;
2437 kind = PyUnicode_KIND(string);
2438 data = PyUnicode_DATA(string);
2439 len = PyUnicode_GET_LENGTH(string);
2440 targetlen = len;
2441 if (copy_null)
2442 targetlen++;
2443 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002444 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 if (!target) {
2446 PyErr_NoMemory();
2447 return NULL;
2448 }
2449 }
2450 else {
2451 if (targetsize < targetlen) {
2452 PyErr_Format(PyExc_SystemError,
2453 "string is longer than the buffer");
2454 if (copy_null && 0 < targetsize)
2455 target[0] = 0;
2456 return NULL;
2457 }
2458 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002459 if (kind == PyUnicode_1BYTE_KIND) {
2460 Py_UCS1 *start = (Py_UCS1 *) data;
2461 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002462 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002463 else if (kind == PyUnicode_2BYTE_KIND) {
2464 Py_UCS2 *start = (Py_UCS2 *) data;
2465 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2466 }
2467 else {
2468 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002469 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002470 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 if (copy_null)
2472 target[len] = 0;
2473 return target;
2474}
2475
2476Py_UCS4*
2477PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2478 int copy_null)
2479{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002480 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002481 PyErr_BadInternalCall();
2482 return NULL;
2483 }
2484 return as_ucs4(string, target, targetsize, copy_null);
2485}
2486
2487Py_UCS4*
2488PyUnicode_AsUCS4Copy(PyObject *string)
2489{
2490 return as_ucs4(string, NULL, 0, 1);
2491}
2492
/* Upper bound on the number of characters produced when formatting a
   value with %lld or %p.
   A value occupying SIZEOF_LONG_LONG bytes needs at most
   ceil(log10(256) * SIZEOF_LONG_LONG) decimal digits, plus one character
   for the sign; 53/22 serves as a rational upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002497
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002498static int
2499unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2500 Py_ssize_t width, Py_ssize_t precision)
2501{
2502 Py_ssize_t length, fill, arglen;
2503 Py_UCS4 maxchar;
2504
2505 if (PyUnicode_READY(str) == -1)
2506 return -1;
2507
2508 length = PyUnicode_GET_LENGTH(str);
2509 if ((precision == -1 || precision >= length)
2510 && width <= length)
2511 return _PyUnicodeWriter_WriteStr(writer, str);
2512
2513 if (precision != -1)
2514 length = Py_MIN(precision, length);
2515
2516 arglen = Py_MAX(length, width);
2517 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2518 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2519 else
2520 maxchar = writer->maxchar;
2521
2522 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2523 return -1;
2524
2525 if (width > length) {
2526 fill = width - length;
2527 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2528 return -1;
2529 writer->pos += fill;
2530 }
2531
2532 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2533 str, 0, length);
2534 writer->pos += length;
2535 return 0;
2536}
2537
2538static int
Victor Stinner998b8062018-09-12 00:23:25 +02002539unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002540 Py_ssize_t width, Py_ssize_t precision)
2541{
2542 /* UTF-8 */
2543 Py_ssize_t length;
2544 PyObject *unicode;
2545 int res;
2546
2547 length = strlen(str);
2548 if (precision != -1)
2549 length = Py_MIN(length, precision);
2550 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2551 if (unicode == NULL)
2552 return -1;
2553
2554 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2555 Py_DECREF(unicode);
2556 return res;
2557}
2558
Victor Stinner96865452011-03-01 23:44:09 +00002559static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002560unicode_fromformat_arg(_PyUnicodeWriter *writer,
2561 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002562{
Victor Stinnere215d962012-10-06 23:03:36 +02002563 const char *p;
2564 Py_ssize_t len;
2565 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002566 Py_ssize_t width;
2567 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002568 int longflag;
2569 int longlongflag;
2570 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002571 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002572
2573 p = f;
2574 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002575 zeropad = 0;
2576 if (*f == '0') {
2577 zeropad = 1;
2578 f++;
2579 }
Victor Stinner96865452011-03-01 23:44:09 +00002580
2581 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002582 width = -1;
2583 if (Py_ISDIGIT((unsigned)*f)) {
2584 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002585 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002586 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002587 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002588 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002589 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002590 return NULL;
2591 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002592 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002593 f++;
2594 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 }
2596 precision = -1;
2597 if (*f == '.') {
2598 f++;
2599 if (Py_ISDIGIT((unsigned)*f)) {
2600 precision = (*f - '0');
2601 f++;
2602 while (Py_ISDIGIT((unsigned)*f)) {
2603 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2604 PyErr_SetString(PyExc_ValueError,
2605 "precision too big");
2606 return NULL;
2607 }
2608 precision = (precision * 10) + (*f - '0');
2609 f++;
2610 }
2611 }
Victor Stinner96865452011-03-01 23:44:09 +00002612 if (*f == '%') {
2613 /* "%.3%s" => f points to "3" */
2614 f--;
2615 }
2616 }
2617 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002618 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002619 f--;
2620 }
Victor Stinner96865452011-03-01 23:44:09 +00002621
2622 /* Handle %ld, %lu, %lld and %llu. */
2623 longflag = 0;
2624 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002625 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002626 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002627 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002628 longflag = 1;
2629 ++f;
2630 }
Victor Stinner96865452011-03-01 23:44:09 +00002631 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002632 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002633 longlongflag = 1;
2634 f += 2;
2635 }
Victor Stinner96865452011-03-01 23:44:09 +00002636 }
2637 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002638 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002639 size_tflag = 1;
2640 ++f;
2641 }
Victor Stinnere215d962012-10-06 23:03:36 +02002642
2643 if (f[1] == '\0')
2644 writer->overallocate = 0;
2645
2646 switch (*f) {
2647 case 'c':
2648 {
2649 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002650 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002651 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002652 "character argument not in range(0x110000)");
2653 return NULL;
2654 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002655 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002656 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002657 break;
2658 }
2659
2660 case 'i':
2661 case 'd':
2662 case 'u':
2663 case 'x':
2664 {
2665 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002666 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002667 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002668
2669 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002670 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002671 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002672 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002673 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002674 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002675 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002676 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002677 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002678 va_arg(*vargs, size_t));
2679 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002680 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002681 va_arg(*vargs, unsigned int));
2682 }
2683 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002684 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002685 }
2686 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002687 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002688 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002689 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002690 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002691 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002692 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002693 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002694 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002695 va_arg(*vargs, Py_ssize_t));
2696 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002697 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002698 va_arg(*vargs, int));
2699 }
2700 assert(len >= 0);
2701
Victor Stinnere215d962012-10-06 23:03:36 +02002702 if (precision < len)
2703 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002704
2705 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002706 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2707 return NULL;
2708
Victor Stinnere215d962012-10-06 23:03:36 +02002709 if (width > precision) {
2710 Py_UCS4 fillchar;
2711 fill = width - precision;
2712 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002713 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2714 return NULL;
2715 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002716 }
Victor Stinner15a11362012-10-06 23:48:20 +02002717 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002718 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002719 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2720 return NULL;
2721 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002722 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002723
Victor Stinner4a587072013-11-19 12:54:53 +01002724 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2725 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002726 break;
2727 }
2728
2729 case 'p':
2730 {
2731 char number[MAX_LONG_LONG_CHARS];
2732
2733 len = sprintf(number, "%p", va_arg(*vargs, void*));
2734 assert(len >= 0);
2735
2736 /* %p is ill-defined: ensure leading 0x. */
2737 if (number[1] == 'X')
2738 number[1] = 'x';
2739 else if (number[1] != 'x') {
2740 memmove(number + 2, number,
2741 strlen(number) + 1);
2742 number[0] = '0';
2743 number[1] = 'x';
2744 len += 2;
2745 }
2746
Victor Stinner4a587072013-11-19 12:54:53 +01002747 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002748 return NULL;
2749 break;
2750 }
2751
2752 case 's':
2753 {
2754 /* UTF-8 */
2755 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002756 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002757 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002758 break;
2759 }
2760
2761 case 'U':
2762 {
2763 PyObject *obj = va_arg(*vargs, PyObject *);
2764 assert(obj && _PyUnicode_CHECK(obj));
2765
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002766 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002767 return NULL;
2768 break;
2769 }
2770
2771 case 'V':
2772 {
2773 PyObject *obj = va_arg(*vargs, PyObject *);
2774 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002775 if (obj) {
2776 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002777 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002778 return NULL;
2779 }
2780 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002781 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002782 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002783 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002784 }
2785 break;
2786 }
2787
2788 case 'S':
2789 {
2790 PyObject *obj = va_arg(*vargs, PyObject *);
2791 PyObject *str;
2792 assert(obj);
2793 str = PyObject_Str(obj);
2794 if (!str)
2795 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002796 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002797 Py_DECREF(str);
2798 return NULL;
2799 }
2800 Py_DECREF(str);
2801 break;
2802 }
2803
2804 case 'R':
2805 {
2806 PyObject *obj = va_arg(*vargs, PyObject *);
2807 PyObject *repr;
2808 assert(obj);
2809 repr = PyObject_Repr(obj);
2810 if (!repr)
2811 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002812 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002813 Py_DECREF(repr);
2814 return NULL;
2815 }
2816 Py_DECREF(repr);
2817 break;
2818 }
2819
2820 case 'A':
2821 {
2822 PyObject *obj = va_arg(*vargs, PyObject *);
2823 PyObject *ascii;
2824 assert(obj);
2825 ascii = PyObject_ASCII(obj);
2826 if (!ascii)
2827 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002828 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002829 Py_DECREF(ascii);
2830 return NULL;
2831 }
2832 Py_DECREF(ascii);
2833 break;
2834 }
2835
2836 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002837 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002838 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002839 break;
2840
2841 default:
2842 /* if we stumble upon an unknown formatting code, copy the rest
2843 of the format string to the output string. (we cannot just
2844 skip the code, since there's no way to know what's in the
2845 argument list) */
2846 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002847 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002848 return NULL;
2849 f = p+len;
2850 return f;
2851 }
2852
2853 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002854 return f;
2855}
2856
Walter Dörwaldd2034312007-05-18 16:29:38 +00002857PyObject *
2858PyUnicode_FromFormatV(const char *format, va_list vargs)
2859{
Victor Stinnere215d962012-10-06 23:03:36 +02002860 va_list vargs2;
2861 const char *f;
2862 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002863
Victor Stinner8f674cc2013-04-17 23:02:17 +02002864 _PyUnicodeWriter_Init(&writer);
2865 writer.min_length = strlen(format) + 100;
2866 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002867
Benjamin Peterson0c212142016-09-20 20:39:33 -07002868 // Copy varags to be able to pass a reference to a subfunction.
2869 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002870
2871 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002872 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002873 f = unicode_fromformat_arg(&writer, f, &vargs2);
2874 if (f == NULL)
2875 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002876 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002877 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002878 const char *p;
2879 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002880
Victor Stinnere215d962012-10-06 23:03:36 +02002881 p = f;
2882 do
2883 {
2884 if ((unsigned char)*p > 127) {
2885 PyErr_Format(PyExc_ValueError,
2886 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2887 "string, got a non-ASCII byte: 0x%02x",
2888 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002889 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002890 }
2891 p++;
2892 }
2893 while (*p != '\0' && *p != '%');
2894 len = p - f;
2895
2896 if (*p == '\0')
2897 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002898
2899 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002900 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002901
2902 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002903 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002904 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002905 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002906 return _PyUnicodeWriter_Finish(&writer);
2907
2908 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002909 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002910 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002911 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002912}
2913
Walter Dörwaldd2034312007-05-18 16:29:38 +00002914PyObject *
2915PyUnicode_FromFormat(const char *format, ...)
2916{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002917 PyObject* ret;
2918 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002919
2920#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002921 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002922#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002923 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002924#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002925 ret = PyUnicode_FromFormatV(format, vargs);
2926 va_end(vargs);
2927 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002928}
2929
Serhiy Storchakac46db922018-10-23 22:58:24 +03002930static Py_ssize_t
2931unicode_get_widechar_size(PyObject *unicode)
2932{
2933 Py_ssize_t res;
2934
2935 assert(unicode != NULL);
2936 assert(_PyUnicode_CHECK(unicode));
2937
2938 if (_PyUnicode_WSTR(unicode) != NULL) {
2939 return PyUnicode_WSTR_LENGTH(unicode);
2940 }
2941 assert(PyUnicode_IS_READY(unicode));
2942
2943 res = _PyUnicode_LENGTH(unicode);
2944#if SIZEOF_WCHAR_T == 2
2945 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
2946 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2947 const Py_UCS4 *end = s + res;
2948 for (; s < end; ++s) {
2949 if (*s > 0xFFFF) {
2950 ++res;
2951 }
2952 }
2953 }
2954#endif
2955 return res;
2956}
2957
2958static void
2959unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
2960{
2961 const wchar_t *wstr;
2962
2963 assert(unicode != NULL);
2964 assert(_PyUnicode_CHECK(unicode));
2965
2966 wstr = _PyUnicode_WSTR(unicode);
2967 if (wstr != NULL) {
2968 memcpy(w, wstr, size * sizeof(wchar_t));
2969 return;
2970 }
2971 assert(PyUnicode_IS_READY(unicode));
2972
2973 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
2974 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
2975 for (; size--; ++s, ++w) {
2976 *w = *s;
2977 }
2978 }
2979 else {
2980#if SIZEOF_WCHAR_T == 4
2981 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
2982 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
2983 for (; size--; ++s, ++w) {
2984 *w = *s;
2985 }
2986#else
2987 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2988 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2989 for (; size--; ++s, ++w) {
2990 Py_UCS4 ch = *s;
2991 if (ch > 0xFFFF) {
2992 assert(ch <= MAX_UNICODE);
2993 /* encode surrogate pair in this case */
2994 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
2995 if (!size--)
2996 break;
2997 *w = Py_UNICODE_LOW_SURROGATE(ch);
2998 }
2999 else {
3000 *w = ch;
3001 }
3002 }
3003#endif
3004 }
3005}
3006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003007#ifdef HAVE_WCHAR_H
3008
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003009/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003010
Victor Stinnerd88d9832011-09-06 02:00:05 +02003011 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003012 character) required to convert the unicode object. Ignore size argument.
3013
Victor Stinnerd88d9832011-09-06 02:00:05 +02003014 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003015 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003016 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003017Py_ssize_t
3018PyUnicode_AsWideChar(PyObject *unicode,
3019 wchar_t *w,
3020 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003021{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003022 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003023
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003024 if (unicode == NULL) {
3025 PyErr_BadInternalCall();
3026 return -1;
3027 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003028 if (!PyUnicode_Check(unicode)) {
3029 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003030 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003031 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003032
3033 res = unicode_get_widechar_size(unicode);
3034 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003035 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003036 }
3037
3038 if (size > res) {
3039 size = res + 1;
3040 }
3041 else {
3042 res = size;
3043 }
3044 unicode_copy_as_widechar(unicode, w, size);
3045 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003046}
3047
Victor Stinner137c34c2010-09-29 10:25:54 +00003048wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003049PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003050 Py_ssize_t *size)
3051{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003052 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003053 Py_ssize_t buflen;
3054
3055 if (unicode == NULL) {
3056 PyErr_BadInternalCall();
3057 return NULL;
3058 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003059 if (!PyUnicode_Check(unicode)) {
3060 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003061 return NULL;
3062 }
3063
Serhiy Storchakac46db922018-10-23 22:58:24 +03003064 buflen = unicode_get_widechar_size(unicode);
3065 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003066 if (buffer == NULL) {
3067 PyErr_NoMemory();
3068 return NULL;
3069 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003070 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3071 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003072 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003073 }
3074 else if (wcslen(buffer) != (size_t)buflen) {
3075 PyMem_FREE(buffer);
3076 PyErr_SetString(PyExc_ValueError,
3077 "embedded null character");
3078 return NULL;
3079 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003080 return buffer;
3081}
3082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003083#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084
Alexander Belopolsky40018472011-02-26 01:02:56 +00003085PyObject *
3086PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003087{
Victor Stinner8faf8212011-12-08 22:14:11 +01003088 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003089 PyErr_SetString(PyExc_ValueError,
3090 "chr() arg not in range(0x110000)");
3091 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003092 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003093
Victor Stinner985a82a2014-01-03 12:53:47 +01003094 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003095}
3096
Alexander Belopolsky40018472011-02-26 01:02:56 +00003097PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003098PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003100 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003101 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003102 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003103 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003104 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003105 Py_INCREF(obj);
3106 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003107 }
3108 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003109 /* For a Unicode subtype that's not a Unicode object,
3110 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003111 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003112 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003113 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003114 "Can't convert '%.100s' object to str implicitly",
3115 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003116 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003117}
3118
Alexander Belopolsky40018472011-02-26 01:02:56 +00003119PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003120PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003121 const char *encoding,
3122 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003123{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003124 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003125 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003126
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003128 PyErr_BadInternalCall();
3129 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003131
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003132 /* Decoding bytes objects is the most common case and should be fast */
3133 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003134 if (PyBytes_GET_SIZE(obj) == 0)
3135 _Py_RETURN_UNICODE_EMPTY();
3136 v = PyUnicode_Decode(
3137 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3138 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003139 return v;
3140 }
3141
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003142 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003143 PyErr_SetString(PyExc_TypeError,
3144 "decoding str is not supported");
3145 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003146 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003147
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003148 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3149 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3150 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003151 "decoding to str: need a bytes-like object, %.80s found",
3152 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003153 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003154 }
Tim Petersced69f82003-09-16 20:30:58 +00003155
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003156 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003157 PyBuffer_Release(&buffer);
3158 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003160
Serhiy Storchaka05997252013-01-26 12:14:02 +02003161 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003162 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003163 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164}
3165
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase; every run of
   other characters found between two copied characters becomes a single
   '_' (leading and trailing runs are dropped).  The result is written,
   null-terminated, into `lower`.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last writable slot: it is reserved for the terminating '\0'. */
    char *const limit = &lower[lower_len - 1];
    int pending_sep = 0;
    const char *src;

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only emitted if another
               copied character follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3214
Alexander Belopolsky40018472011-02-26 01:02:56 +00003215PyObject *
3216PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003217 Py_ssize_t size,
3218 const char *encoding,
3219 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003220{
3221 PyObject *buffer = NULL, *unicode;
3222 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003223 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3224
3225 if (encoding == NULL) {
3226 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3227 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003228
Fred Drakee4315f52000-05-09 19:53:39 +00003229 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003230 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3231 char *lower = buflower;
3232
3233 /* Fast paths */
3234 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3235 lower += 3;
3236 if (*lower == '_') {
3237 /* Match "utf8" and "utf_8" */
3238 lower++;
3239 }
3240
3241 if (lower[0] == '8' && lower[1] == 0) {
3242 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3243 }
3244 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3245 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3246 }
3247 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3248 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3249 }
3250 }
3251 else {
3252 if (strcmp(lower, "ascii") == 0
3253 || strcmp(lower, "us_ascii") == 0) {
3254 return PyUnicode_DecodeASCII(s, size, errors);
3255 }
Steve Dowercc16be82016-09-08 10:35:16 -07003256 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003257 else if (strcmp(lower, "mbcs") == 0) {
3258 return PyUnicode_DecodeMBCS(s, size, errors);
3259 }
3260 #endif
3261 else if (strcmp(lower, "latin1") == 0
3262 || strcmp(lower, "latin_1") == 0
3263 || strcmp(lower, "iso_8859_1") == 0
3264 || strcmp(lower, "iso8859_1") == 0) {
3265 return PyUnicode_DecodeLatin1(s, size, errors);
3266 }
3267 }
Victor Stinner37296e82010-06-10 13:36:23 +00003268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269
3270 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003271 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003272 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003273 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003274 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275 if (buffer == NULL)
3276 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003277 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278 if (unicode == NULL)
3279 goto onError;
3280 if (!PyUnicode_Check(unicode)) {
3281 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003282 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003283 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003284 encoding,
3285 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 Py_DECREF(unicode);
3287 goto onError;
3288 }
3289 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003290 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003291
Benjamin Peterson29060642009-01-31 22:14:21 +00003292 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 Py_XDECREF(buffer);
3294 return NULL;
3295}
3296
Alexander Belopolsky40018472011-02-26 01:02:56 +00003297PyObject *
3298PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003299 const char *encoding,
3300 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003301{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003302 if (!PyUnicode_Check(unicode)) {
3303 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003304 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003305 }
3306
Serhiy Storchaka00939072016-10-27 21:05:49 +03003307 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3308 "PyUnicode_AsDecodedObject() is deprecated; "
3309 "use PyCodec_Decode() to decode from str", 1) < 0)
3310 return NULL;
3311
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003312 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003313 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003314
3315 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003316 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003317}
3318
Alexander Belopolsky40018472011-02-26 01:02:56 +00003319PyObject *
3320PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003321 const char *encoding,
3322 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003323{
3324 PyObject *v;
3325
3326 if (!PyUnicode_Check(unicode)) {
3327 PyErr_BadArgument();
3328 goto onError;
3329 }
3330
Serhiy Storchaka00939072016-10-27 21:05:49 +03003331 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3332 "PyUnicode_AsDecodedUnicode() is deprecated; "
3333 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3334 return NULL;
3335
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003336 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003337 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003338
3339 /* Decode via the codec registry */
3340 v = PyCodec_Decode(unicode, encoding, errors);
3341 if (v == NULL)
3342 goto onError;
3343 if (!PyUnicode_Check(v)) {
3344 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003345 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003346 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003347 encoding,
3348 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003349 Py_DECREF(v);
3350 goto onError;
3351 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003352 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003353
Benjamin Peterson29060642009-01-31 22:14:21 +00003354 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003355 return NULL;
3356}
3357
Alexander Belopolsky40018472011-02-26 01:02:56 +00003358PyObject *
3359PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003360 Py_ssize_t size,
3361 const char *encoding,
3362 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003363{
3364 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003365
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003366 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003367 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003368 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3370 Py_DECREF(unicode);
3371 return v;
3372}
3373
Alexander Belopolsky40018472011-02-26 01:02:56 +00003374PyObject *
3375PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003376 const char *encoding,
3377 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003378{
3379 PyObject *v;
3380
3381 if (!PyUnicode_Check(unicode)) {
3382 PyErr_BadArgument();
3383 goto onError;
3384 }
3385
Serhiy Storchaka00939072016-10-27 21:05:49 +03003386 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3387 "PyUnicode_AsEncodedObject() is deprecated; "
3388 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3389 "or PyCodec_Encode() for generic encoding", 1) < 0)
3390 return NULL;
3391
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003392 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003393 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003394
3395 /* Encode via the codec registry */
3396 v = PyCodec_Encode(unicode, encoding, errors);
3397 if (v == NULL)
3398 goto onError;
3399 return v;
3400
Benjamin Peterson29060642009-01-31 22:14:21 +00003401 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003402 return NULL;
3403}
3404
Victor Stinner1b579672011-12-17 05:47:23 +01003405
Victor Stinner2cba6b82018-01-10 22:46:15 +01003406static PyObject *
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003407unicode_encode_locale(PyObject *unicode, const char *errors,
3408 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003409{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003410 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003411
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003412 Py_ssize_t wlen;
3413 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3414 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003415 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003416 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003417
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003418 Py_ssize_t wlen2 = wcslen(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003419 if (wlen2 != wlen) {
3420 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003421 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003422 return NULL;
3423 }
3424
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003425 char *str;
3426 size_t error_pos;
3427 const char *reason;
3428 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003429 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003430 if (res != 0) {
3431 if (res == -2) {
3432 PyObject *exc;
3433 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3434 "locale", unicode,
3435 (Py_ssize_t)error_pos,
3436 (Py_ssize_t)(error_pos+1),
3437 reason);
3438 if (exc != NULL) {
3439 PyCodec_StrictErrors(exc);
3440 Py_DECREF(exc);
3441 }
3442 return NULL;
Victor Stinner2cba6b82018-01-10 22:46:15 +01003443 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003444 else if (res == -3) {
3445 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3446 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003447 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003448 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003449 PyMem_Free(wstr);
3450 return NULL;
3451 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003452 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003453 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003454
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003455 PyObject *bytes = PyBytes_FromString(str);
3456 PyMem_RawFree(str);
3457 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003458}
3459
Victor Stinnerad158722010-10-27 00:25:46 +00003460PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003461PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3462{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003463 return unicode_encode_locale(unicode, errors, 1);
3464}
3465
3466PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003467PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003468{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003469 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003470 const _PyCoreConfig *config = &interp->core_config;
3471#if defined(__APPLE__)
3472 return _PyUnicode_AsUTF8String(unicode, config->filesystem_errors);
3473#else
Victor Stinner793b5312011-04-27 00:24:21 +02003474 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3475 cannot use it to encode and decode filenames before it is loaded. Load
3476 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003477 implementation of the locale codec until the codec registry is
3478 initialized and the Python codec is loaded. See initfsencoding(). */
3479 if (interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003480 return PyUnicode_AsEncodedString(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003481 config->filesystem_encoding,
3482 config->filesystem_errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003483 }
3484 else {
Victor Stinner2cba6b82018-01-10 22:46:15 +01003485 return unicode_encode_locale(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003486 config->filesystem_errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003487 }
Victor Stinnerad158722010-10-27 00:25:46 +00003488#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003489}
3490
Alexander Belopolsky40018472011-02-26 01:02:56 +00003491PyObject *
3492PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003493 const char *encoding,
3494 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495{
3496 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003497 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003498
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499 if (!PyUnicode_Check(unicode)) {
3500 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502 }
Fred Drakee4315f52000-05-09 19:53:39 +00003503
Victor Stinner942889a2016-09-05 15:40:10 -07003504 if (encoding == NULL) {
3505 return _PyUnicode_AsUTF8String(unicode, errors);
3506 }
3507
Fred Drakee4315f52000-05-09 19:53:39 +00003508 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003509 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3510 char *lower = buflower;
3511
3512 /* Fast paths */
3513 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3514 lower += 3;
3515 if (*lower == '_') {
3516 /* Match "utf8" and "utf_8" */
3517 lower++;
3518 }
3519
3520 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003521 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003522 }
3523 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3524 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3525 }
3526 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3527 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3528 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003529 }
Victor Stinner942889a2016-09-05 15:40:10 -07003530 else {
3531 if (strcmp(lower, "ascii") == 0
3532 || strcmp(lower, "us_ascii") == 0) {
3533 return _PyUnicode_AsASCIIString(unicode, errors);
3534 }
Steve Dowercc16be82016-09-08 10:35:16 -07003535#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003536 else if (strcmp(lower, "mbcs") == 0) {
3537 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3538 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003539#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003540 else if (strcmp(lower, "latin1") == 0 ||
3541 strcmp(lower, "latin_1") == 0 ||
3542 strcmp(lower, "iso_8859_1") == 0 ||
3543 strcmp(lower, "iso8859_1") == 0) {
3544 return _PyUnicode_AsLatin1String(unicode, errors);
3545 }
3546 }
Victor Stinner37296e82010-06-10 13:36:23 +00003547 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548
3549 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003550 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003552 return NULL;
3553
3554 /* The normal path */
3555 if (PyBytes_Check(v))
3556 return v;
3557
3558 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003559 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003560 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003561 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003562
3563 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003564 "encoder %s returned bytearray instead of bytes; "
3565 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003566 encoding);
3567 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003568 Py_DECREF(v);
3569 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003570 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003571
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003572 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3573 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003574 Py_DECREF(v);
3575 return b;
3576 }
3577
3578 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003579 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003580 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003581 encoding,
3582 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003583 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003584 return NULL;
3585}
3586
Alexander Belopolsky40018472011-02-26 01:02:56 +00003587PyObject *
3588PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003589 const char *encoding,
3590 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003591{
3592 PyObject *v;
3593
3594 if (!PyUnicode_Check(unicode)) {
3595 PyErr_BadArgument();
3596 goto onError;
3597 }
3598
Serhiy Storchaka00939072016-10-27 21:05:49 +03003599 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3600 "PyUnicode_AsEncodedUnicode() is deprecated; "
3601 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3602 return NULL;
3603
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003604 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003605 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003606
3607 /* Encode via the codec registry */
3608 v = PyCodec_Encode(unicode, encoding, errors);
3609 if (v == NULL)
3610 goto onError;
3611 if (!PyUnicode_Check(v)) {
3612 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003613 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003614 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003615 encoding,
3616 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003617 Py_DECREF(v);
3618 goto onError;
3619 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003621
Benjamin Peterson29060642009-01-31 22:14:21 +00003622 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623 return NULL;
3624}
3625
Victor Stinner2cba6b82018-01-10 22:46:15 +01003626static PyObject*
3627unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3628 int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003629{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003630 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003631
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003632 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3633 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003634 return NULL;
3635 }
3636
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003637 wchar_t *wstr;
3638 size_t wlen;
3639 const char *reason;
3640 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003641 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003642 if (res != 0) {
3643 if (res == -2) {
3644 PyObject *exc;
3645 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3646 "locale", str, len,
3647 (Py_ssize_t)wlen,
3648 (Py_ssize_t)(wlen + 1),
3649 reason);
3650 if (exc != NULL) {
3651 PyCodec_StrictErrors(exc);
3652 Py_DECREF(exc);
3653 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003654 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003655 else if (res == -3) {
3656 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3657 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003658 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003659 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003660 }
Victor Stinner2f197072011-12-17 07:08:30 +01003661 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003662 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003663
3664 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3665 PyMem_RawFree(wstr);
3666 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003667}
3668
3669PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003670PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3671 const char *errors)
3672{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003673 return unicode_decode_locale(str, len, errors, 1);
3674}
3675
3676PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003677PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003678{
3679 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003680 return unicode_decode_locale(str, size, errors, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003681}
3682
3683
3684PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003685PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003686 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003687 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3688}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003689
Christian Heimes5894ba72007-11-04 11:43:14 +00003690PyObject*
3691PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3692{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003693 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003694 const _PyCoreConfig *config = &interp->core_config;
3695#if defined(__APPLE__)
3696 return PyUnicode_DecodeUTF8Stateful(s, size, config->filesystem_errors, NULL);
3697#else
Victor Stinner793b5312011-04-27 00:24:21 +02003698 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3699 cannot use it to encode and decode filenames before it is loaded. Load
3700 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003701 implementation of the locale codec until the codec registry is
3702 initialized and the Python codec is loaded. See initfsencoding(). */
3703 if (interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003704 return PyUnicode_Decode(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003705 config->filesystem_encoding,
3706 config->filesystem_errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003707 }
3708 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003709 return unicode_decode_locale(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003710 config->filesystem_errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003711 }
Victor Stinnerad158722010-10-27 00:25:46 +00003712#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003713}
3714
Martin v. Löwis011e8422009-05-05 04:43:17 +00003715
3716int
3717PyUnicode_FSConverter(PyObject* arg, void* addr)
3718{
Brett Cannonec6ce872016-09-06 15:50:29 -07003719 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003720 PyObject *output = NULL;
3721 Py_ssize_t size;
3722 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003723 if (arg == NULL) {
3724 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003725 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003726 return 1;
3727 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003728 path = PyOS_FSPath(arg);
3729 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003730 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003731 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003732 if (PyBytes_Check(path)) {
3733 output = path;
3734 }
3735 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3736 output = PyUnicode_EncodeFSDefault(path);
3737 Py_DECREF(path);
3738 if (!output) {
3739 return 0;
3740 }
3741 assert(PyBytes_Check(output));
3742 }
3743
Victor Stinner0ea2a462010-04-30 00:22:08 +00003744 size = PyBytes_GET_SIZE(output);
3745 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003746 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003747 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003748 Py_DECREF(output);
3749 return 0;
3750 }
3751 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003752 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003753}
3754
3755
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003756int
3757PyUnicode_FSDecoder(PyObject* arg, void* addr)
3758{
Brett Cannona5711202016-09-06 19:36:01 -07003759 int is_buffer = 0;
3760 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003761 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003762 if (arg == NULL) {
3763 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003764 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003765 return 1;
3766 }
Brett Cannona5711202016-09-06 19:36:01 -07003767
3768 is_buffer = PyObject_CheckBuffer(arg);
3769 if (!is_buffer) {
3770 path = PyOS_FSPath(arg);
3771 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003772 return 0;
3773 }
Brett Cannona5711202016-09-06 19:36:01 -07003774 }
3775 else {
3776 path = arg;
3777 Py_INCREF(arg);
3778 }
3779
3780 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003781 output = path;
3782 }
3783 else if (PyBytes_Check(path) || is_buffer) {
3784 PyObject *path_bytes = NULL;
3785
3786 if (!PyBytes_Check(path) &&
3787 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003788 "path should be string, bytes, or os.PathLike, not %.200s",
3789 Py_TYPE(arg)->tp_name)) {
3790 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003791 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003792 }
3793 path_bytes = PyBytes_FromObject(path);
3794 Py_DECREF(path);
3795 if (!path_bytes) {
3796 return 0;
3797 }
3798 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3799 PyBytes_GET_SIZE(path_bytes));
3800 Py_DECREF(path_bytes);
3801 if (!output) {
3802 return 0;
3803 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003804 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003805 else {
3806 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003807 "path should be string, bytes, or os.PathLike, not %.200s",
3808 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003809 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003810 return 0;
3811 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003812 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003813 Py_DECREF(output);
3814 return 0;
3815 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003816 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003817 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003818 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003819 Py_DECREF(output);
3820 return 0;
3821 }
3822 *(PyObject**)addr = output;
3823 return Py_CLEANUP_SUPPORTED;
3824}
3825
3826
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003827const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003828PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003829{
Christian Heimesf3863112007-11-22 07:46:41 +00003830 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003831
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003832 if (!PyUnicode_Check(unicode)) {
3833 PyErr_BadArgument();
3834 return NULL;
3835 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003836 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003837 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003838
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003839 if (PyUnicode_UTF8(unicode) == NULL) {
3840 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003841 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003842 if (bytes == NULL)
3843 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003844 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3845 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003846 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003847 Py_DECREF(bytes);
3848 return NULL;
3849 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003850 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003851 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003852 PyBytes_AS_STRING(bytes),
3853 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003854 Py_DECREF(bytes);
3855 }
3856
3857 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003858 *psize = PyUnicode_UTF8_LENGTH(unicode);
3859 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003860}
3861
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003862const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003863PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003864{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003865 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3866}
3867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003868Py_UNICODE *
3869PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3870{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871 if (!PyUnicode_Check(unicode)) {
3872 PyErr_BadArgument();
3873 return NULL;
3874 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003875 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3876 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003878 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003879 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003880
Serhiy Storchakac46db922018-10-23 22:58:24 +03003881 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3882 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3883 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003884 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003886 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3887 if (w == NULL) {
3888 PyErr_NoMemory();
3889 return NULL;
3890 }
3891 unicode_copy_as_widechar(unicode, w, wlen + 1);
3892 _PyUnicode_WSTR(unicode) = w;
3893 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3894 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895 }
3896 }
3897 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003898 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003899 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00003900}
3901
Alexander Belopolsky40018472011-02-26 01:02:56 +00003902Py_UNICODE *
3903PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003904{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003905 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906}
3907
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03003908const Py_UNICODE *
3909_PyUnicode_AsUnicode(PyObject *unicode)
3910{
3911 Py_ssize_t size;
3912 const Py_UNICODE *wstr;
3913
3914 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3915 if (wstr && wcslen(wstr) != (size_t)size) {
3916 PyErr_SetString(PyExc_ValueError, "embedded null character");
3917 return NULL;
3918 }
3919 return wstr;
3920}
3921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922
Alexander Belopolsky40018472011-02-26 01:02:56 +00003923Py_ssize_t
3924PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925{
3926 if (!PyUnicode_Check(unicode)) {
3927 PyErr_BadArgument();
3928 goto onError;
3929 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003930 if (_PyUnicode_WSTR(unicode) == NULL) {
3931 if (PyUnicode_AsUnicode(unicode) == NULL)
3932 goto onError;
3933 }
3934 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003935
Benjamin Peterson29060642009-01-31 22:14:21 +00003936 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937 return -1;
3938}
3939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003940Py_ssize_t
3941PyUnicode_GetLength(PyObject *unicode)
3942{
Victor Stinner07621332012-06-16 04:53:46 +02003943 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944 PyErr_BadArgument();
3945 return -1;
3946 }
Victor Stinner07621332012-06-16 04:53:46 +02003947 if (PyUnicode_READY(unicode) == -1)
3948 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949 return PyUnicode_GET_LENGTH(unicode);
3950}
3951
3952Py_UCS4
3953PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3954{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003955 void *data;
3956 int kind;
3957
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003958 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003959 PyErr_BadArgument();
3960 return (Py_UCS4)-1;
3961 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003962 if (PyUnicode_READY(unicode) == -1) {
3963 return (Py_UCS4)-1;
3964 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003965 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003966 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003967 return (Py_UCS4)-1;
3968 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003969 data = PyUnicode_DATA(unicode);
3970 kind = PyUnicode_KIND(unicode);
3971 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003972}
3973
3974int
3975PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3976{
3977 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003978 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003979 return -1;
3980 }
Victor Stinner488fa492011-12-12 00:01:39 +01003981 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003982 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003983 PyErr_SetString(PyExc_IndexError, "string index out of range");
3984 return -1;
3985 }
Victor Stinner488fa492011-12-12 00:01:39 +01003986 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003987 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003988 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3989 PyErr_SetString(PyExc_ValueError, "character out of range");
3990 return -1;
3991 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3993 index, ch);
3994 return 0;
3995}
3996
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4002
Victor Stinner554f3f02010-06-16 23:33:54 +00004003/* create or adjust a UnicodeDecodeError */
4004static void
4005make_decode_exception(PyObject **exceptionObject,
4006 const char *encoding,
4007 const char *input, Py_ssize_t length,
4008 Py_ssize_t startpos, Py_ssize_t endpos,
4009 const char *reason)
4010{
4011 if (*exceptionObject == NULL) {
4012 *exceptionObject = PyUnicodeDecodeError_Create(
4013 encoding, input, length, startpos, endpos, reason);
4014 }
4015 else {
4016 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4017 goto onError;
4018 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4019 goto onError;
4020 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4021 goto onError;
4022 }
4023 return;
4024
4025onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004026 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004027}
4028
Steve Dowercc16be82016-09-08 10:35:16 -07004029#ifdef MS_WINDOWS
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004030/* error handling callback helper:
4031 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004032 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004033 and adjust various state variables.
4034 return 0 on success, -1 on error
4035*/
4036
Alexander Belopolsky40018472011-02-26 01:02:56 +00004037static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004038unicode_decode_call_errorhandler_wchar(
4039 const char *errors, PyObject **errorHandler,
4040 const char *encoding, const char *reason,
4041 const char **input, const char **inend, Py_ssize_t *startinpos,
4042 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4043 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004044{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004045 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004046
4047 PyObject *restuple = NULL;
4048 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004049 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004050 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004051 Py_ssize_t requiredsize;
4052 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004053 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004054 wchar_t *repwstr;
4055 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004056
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004057 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4058 outsize = _PyUnicode_WSTR_LENGTH(*output);
Victor Stinner596a6c42011-11-09 00:02:18 +01004059
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004060 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004061 *errorHandler = PyCodec_LookupError(errors);
4062 if (*errorHandler == NULL)
4063 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 }
4065
Victor Stinner554f3f02010-06-16 23:33:54 +00004066 make_decode_exception(exceptionObject,
4067 encoding,
4068 *input, *inend - *input,
4069 *startinpos, *endinpos,
4070 reason);
4071 if (*exceptionObject == NULL)
4072 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004073
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004074 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004075 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004076 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004077 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004078 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004079 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004081 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004082 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004083
4084 /* Copy back the bytes variables, which might have been modified by the
4085 callback */
4086 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4087 if (!inputobj)
4088 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004089 *input = PyBytes_AS_STRING(inputobj);
4090 insize = PyBytes_GET_SIZE(inputobj);
4091 *inend = *input + insize;
4092 /* we can DECREF safely, as the exception has another reference,
4093 so the object won't go away. */
4094 Py_DECREF(inputobj);
4095
4096 if (newpos<0)
4097 newpos = insize+newpos;
4098 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004099 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004100 goto onError;
4101 }
4102
4103 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4104 if (repwstr == NULL)
4105 goto onError;
4106 /* need more space? (at least enough for what we
4107 have+the replacement+the rest of the string (starting
4108 at the new input position), so we won't have to check space
4109 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004110 requiredsize = *outpos;
4111 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4112 goto overflow;
4113 requiredsize += repwlen;
4114 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4115 goto overflow;
4116 requiredsize += insize - newpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004117 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004118 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004119 requiredsize = 2*outsize;
4120 if (unicode_resize(output, requiredsize) < 0)
4121 goto onError;
4122 }
4123 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4124 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004125 *endinpos = newpos;
4126 *inptr = *input + newpos;
4127
4128 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004129 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004130 return 0;
4131
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004132 overflow:
4133 PyErr_SetString(PyExc_OverflowError,
4134 "decoded result is too long for a Python string");
4135
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004136 onError:
4137 Py_XDECREF(restuple);
4138 return -1;
4139}
Steve Dowercc16be82016-09-08 10:35:16 -07004140#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004141
4142static int
4143unicode_decode_call_errorhandler_writer(
4144 const char *errors, PyObject **errorHandler,
4145 const char *encoding, const char *reason,
4146 const char **input, const char **inend, Py_ssize_t *startinpos,
4147 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4148 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4149{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004150 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004151
4152 PyObject *restuple = NULL;
4153 PyObject *repunicode = NULL;
4154 Py_ssize_t insize;
4155 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004156 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004157 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004158 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004159 int need_to_grow = 0;
4160 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004161
4162 if (*errorHandler == NULL) {
4163 *errorHandler = PyCodec_LookupError(errors);
4164 if (*errorHandler == NULL)
4165 goto onError;
4166 }
4167
4168 make_decode_exception(exceptionObject,
4169 encoding,
4170 *input, *inend - *input,
4171 *startinpos, *endinpos,
4172 reason);
4173 if (*exceptionObject == NULL)
4174 goto onError;
4175
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004176 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004177 if (restuple == NULL)
4178 goto onError;
4179 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004180 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004181 goto onError;
4182 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004183 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004184 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004185
4186 /* Copy back the bytes variables, which might have been modified by the
4187 callback */
4188 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4189 if (!inputobj)
4190 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004191 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004192 *input = PyBytes_AS_STRING(inputobj);
4193 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004194 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004195 /* we can DECREF safely, as the exception has another reference,
4196 so the object won't go away. */
4197 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004198
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004200 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004201 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004202 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004203 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004204 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205
Victor Stinner170ca6f2013-04-18 00:25:28 +02004206 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004207 if (replen > 1) {
4208 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004209 need_to_grow = 1;
4210 }
4211 new_inptr = *input + newpos;
4212 if (*inend - new_inptr > remain) {
4213 /* We don't know the decoding algorithm here so we make the worst
4214 assumption that one byte decodes to one unicode character.
4215 If unfortunately one byte could decode to more unicode characters,
4216 the decoder may write out-of-bound then. Is it possible for the
4217 algorithms using this function? */
4218 writer->min_length += *inend - new_inptr - remain;
4219 need_to_grow = 1;
4220 }
4221 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004222 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004223 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004224 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4225 goto onError;
4226 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004227 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004228 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004229
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004231 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004232
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004234 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004235 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236
Benjamin Peterson29060642009-01-31 22:14:21 +00004237 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004238 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004239 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004240}
4241
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004242/* --- UTF-7 Codec -------------------------------------------------------- */
4243
Antoine Pitrou244651a2009-05-04 18:56:13 +00004244/* See RFC2152 for details. We encode conservatively and decode liberally. */
4245
4246/* Three simple macros defining base-64. */
4247
/* Non-zero when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('A' <= (c) && (c) <= 'Z'))
4255
/* Base-64 value (0..63) of c.  Only meaningful when IS_BASE64(c) holds:
   any character outside the other classes yields 63, the value of '/'. */

#define FROM_BASE64(c)                                      \
    ((c) == '+' ? 62 :                                      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     63)
4263
/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 63])
4268
/* DECODE_DIRECT: true when a byte found in a UTF-7 string stands for
 * itself.  Decoding is permissive: every ASCII byte (value <= 127) is
 * taken literally, except '+', which opens a base64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
4276
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * so it is declared const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4310
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - character code; only values 1..127 can be direct.
 * directO  - non-zero to emit "Set O" characters (category 1) literally.
 * directWS - non-zero to emit whitespace (category 2) literally.
 *
 * "Set D" characters (category 0) are always direct.  The flag arguments
 * are parenthesized in the expansion so that compound expressions such as
 * `a || b` keep their meaning next to the `&&` operators. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004322
Alexander Belopolsky40018472011-02-26 01:02:56 +00004323PyObject *
4324PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004325 Py_ssize_t size,
4326 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004327{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004328 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4329}
4330
Antoine Pitrou244651a2009-05-04 18:56:13 +00004331/* The decoder. The only state we preserve is our read position,
4332 * i.e. how many characters we have consumed. So if we end in the
4333 * middle of a shift sequence we have to back off the read position
4334 * and the output to the beginning of the sequence, otherwise we lose
4335 * all the shift state (seen bits, number of bits seen, high
4336 * surrogate). */
4337
Alexander Belopolsky40018472011-02-26 01:02:56 +00004338PyObject *
4339PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004340 Py_ssize_t size,
4341 const char *errors,
4342 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004343{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004344 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004345 Py_ssize_t startinpos;
4346 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004347 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004348 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004349 const char *errmsg = "";
4350 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004351 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 unsigned int base64bits = 0;
4353 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004354 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355 PyObject *errorHandler = NULL;
4356 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004357
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004358 if (size == 0) {
4359 if (consumed)
4360 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004361 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004362 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004363
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004364 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004365 _PyUnicodeWriter_Init(&writer);
4366 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004367
4368 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004369 e = s + size;
4370
4371 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004372 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004373 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004374 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004375
Antoine Pitrou244651a2009-05-04 18:56:13 +00004376 if (inShift) { /* in a base-64 section */
4377 if (IS_BASE64(ch)) { /* consume a base-64 character */
4378 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4379 base64bits += 6;
4380 s++;
4381 if (base64bits >= 16) {
4382 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004383 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004384 base64bits -= 16;
4385 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004386 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004387 if (surrogate) {
4388 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004389 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4390 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004391 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004392 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004393 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004394 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004395 }
4396 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004397 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004398 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004399 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004400 }
4401 }
Victor Stinner551ac952011-11-29 22:58:13 +01004402 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403 /* first surrogate */
4404 surrogate = outCh;
4405 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004406 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004407 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004408 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004409 }
4410 }
4411 }
4412 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004413 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004414 if (base64bits > 0) { /* left-over bits */
4415 if (base64bits >= 6) {
4416 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004417 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004418 errmsg = "partial character in shift sequence";
4419 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004420 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 else {
4422 /* Some bits remain; they should be zero */
4423 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004424 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004425 errmsg = "non-zero padding bits in shift sequence";
4426 goto utf7Error;
4427 }
4428 }
4429 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004430 if (surrogate && DECODE_DIRECT(ch)) {
4431 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4432 goto onError;
4433 }
4434 surrogate = 0;
4435 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004436 /* '-' is absorbed; other terminating
4437 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004438 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004440 }
4441 }
4442 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004444 s++; /* consume '+' */
4445 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004446 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004447 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004448 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004449 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004450 else if (s < e && !IS_BASE64(*s)) {
4451 s++;
4452 errmsg = "ill-formed sequence";
4453 goto utf7Error;
4454 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004455 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004456 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004457 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004458 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004460 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004461 }
4462 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004463 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004464 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004465 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004466 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004467 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004468 else {
4469 startinpos = s-starts;
4470 s++;
4471 errmsg = "unexpected special character";
4472 goto utf7Error;
4473 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004474 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004475utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004477 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004478 errors, &errorHandler,
4479 "utf7", errmsg,
4480 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004481 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004482 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004483 }
4484
Antoine Pitrou244651a2009-05-04 18:56:13 +00004485 /* end of string */
4486
4487 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4488 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004489 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004490 if (surrogate ||
4491 (base64bits >= 6) ||
4492 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004493 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004494 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004495 errors, &errorHandler,
4496 "utf7", "unterminated shift sequence",
4497 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004498 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004499 goto onError;
4500 if (s < e)
4501 goto restart;
4502 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004503 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004504
4505 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004506 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004507 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004508 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004509 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004510 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004511 writer.kind, writer.data, shiftOutStart);
4512 Py_XDECREF(errorHandler);
4513 Py_XDECREF(exc);
4514 _PyUnicodeWriter_Dealloc(&writer);
4515 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004516 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004517 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004518 }
4519 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004520 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004522 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004523
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524 Py_XDECREF(errorHandler);
4525 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004526 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004527
Benjamin Peterson29060642009-01-31 22:14:21 +00004528 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529 Py_XDECREF(errorHandler);
4530 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004531 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532 return NULL;
4533}
4534
4535
Alexander Belopolsky40018472011-02-26 01:02:56 +00004536PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004537_PyUnicode_EncodeUTF7(PyObject *str,
4538 int base64SetO,
4539 int base64WhiteSpace,
4540 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004542 int kind;
4543 void *data;
4544 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004545 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004546 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004547 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004548 unsigned int base64bits = 0;
4549 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004550 char * out;
4551 char * start;
4552
Benjamin Petersonbac79492012-01-14 13:34:47 -05004553 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004554 return NULL;
4555 kind = PyUnicode_KIND(str);
4556 data = PyUnicode_DATA(str);
4557 len = PyUnicode_GET_LENGTH(str);
4558
4559 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004560 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004561
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004562 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004563 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004564 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004565 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004566 if (v == NULL)
4567 return NULL;
4568
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004569 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004570 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004571 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004572
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 if (inShift) {
4574 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4575 /* shifting out */
4576 if (base64bits) { /* output remaining bits */
4577 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4578 base64buffer = 0;
4579 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004580 }
4581 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 /* Characters not in the BASE64 set implicitly unshift the sequence
4583 so no '-' is required, except if the character is itself a '-' */
4584 if (IS_BASE64(ch) || ch == '-') {
4585 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004586 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004587 *out++ = (char) ch;
4588 }
4589 else {
4590 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004591 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004592 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004593 else { /* not in a shift sequence */
4594 if (ch == '+') {
4595 *out++ = '+';
4596 *out++ = '-';
4597 }
4598 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4599 *out++ = (char) ch;
4600 }
4601 else {
4602 *out++ = '+';
4603 inShift = 1;
4604 goto encode_char;
4605 }
4606 }
4607 continue;
4608encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004610 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004611
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 /* code first surrogate */
4613 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004614 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 while (base64bits >= 6) {
4616 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4617 base64bits -= 6;
4618 }
4619 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004620 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004621 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 base64bits += 16;
4623 base64buffer = (base64buffer << 16) | ch;
4624 while (base64bits >= 6) {
4625 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4626 base64bits -= 6;
4627 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004628 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004629 if (base64bits)
4630 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4631 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004632 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004633 if (_PyBytes_Resize(&v, out - start) < 0)
4634 return NULL;
4635 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004636}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004637PyObject *
4638PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4639 Py_ssize_t size,
4640 int base64SetO,
4641 int base64WhiteSpace,
4642 const char *errors)
4643{
4644 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004645 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004646 if (tmp == NULL)
4647 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004648 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004649 base64WhiteSpace, errors);
4650 Py_DECREF(tmp);
4651 return result;
4652}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004653
Antoine Pitrou244651a2009-05-04 18:56:13 +00004654#undef IS_BASE64
4655#undef FROM_BASE64
4656#undef TO_BASE64
4657#undef DECODE_DIRECT
4658#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004659
Guido van Rossumd57fd912000-03-10 22:53:23 +00004660/* --- UTF-8 Codec -------------------------------------------------------- */
4661
Alexander Belopolsky40018472011-02-26 01:02:56 +00004662PyObject *
4663PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004664 Py_ssize_t size,
4665 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004666{
Walter Dörwald69652032004-09-07 20:24:22 +00004667 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4668}
4669
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004670#include "stringlib/asciilib.h"
4671#include "stringlib/codecs.h"
4672#include "stringlib/undef.h"
4673
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004674#include "stringlib/ucs1lib.h"
4675#include "stringlib/codecs.h"
4676#include "stringlib/undef.h"
4677
4678#include "stringlib/ucs2lib.h"
4679#include "stringlib/codecs.h"
4680#include "stringlib/undef.h"
4681
4682#include "stringlib/ucs4lib.h"
4683#include "stringlib/codecs.h"
4684#include "stringlib/undef.h"
4685
Antoine Pitrouab868312009-01-10 15:40:25 +00004686/* Mask to quickly check whether a C 'long' contains a
4687 non-ASCII, UTF8-encoded char. */
4688#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004689# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004690#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004691# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004692#else
4693# error C 'long' size should be either 4 or 8!
4694#endif
4695
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004696static Py_ssize_t
4697ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004698{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004699 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004700 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004701
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004702 /*
4703 * Issue #17237: m68k is a bit different from most architectures in
4704 * that objects do not use "natural alignment" - for example, int and
4705 * long are only aligned at 2-byte boundaries. Therefore the assert()
4706 * won't work; also, tests have shown that skipping the "optimised
4707 * version" will even speed up m68k.
4708 */
4709#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004710#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004711 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4712 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004713 /* Fast path, see in STRINGLIB(utf8_decode) for
4714 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004715 /* Help allocation */
4716 const char *_p = p;
4717 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004718 while (_p < aligned_end) {
4719 unsigned long value = *(const unsigned long *) _p;
4720 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004721 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004722 *((unsigned long *)q) = value;
4723 _p += SIZEOF_LONG;
4724 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004725 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004726 p = _p;
4727 while (p < end) {
4728 if ((unsigned char)*p & 0x80)
4729 break;
4730 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004732 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004734#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004735#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004736 while (p < end) {
4737 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4738 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004739 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004740 /* Help allocation */
4741 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004742 while (_p < aligned_end) {
4743 unsigned long value = *(unsigned long *) _p;
4744 if (value & ASCII_CHAR_MASK)
4745 break;
4746 _p += SIZEOF_LONG;
4747 }
4748 p = _p;
4749 if (_p == end)
4750 break;
4751 }
4752 if ((unsigned char)*p & 0x80)
4753 break;
4754 ++p;
4755 }
4756 memcpy(dest, start, p - start);
4757 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758}
Antoine Pitrouab868312009-01-10 15:40:25 +00004759
Victor Stinner785938e2011-12-11 20:09:03 +01004760PyObject *
4761PyUnicode_DecodeUTF8Stateful(const char *s,
4762 Py_ssize_t size,
4763 const char *errors,
4764 Py_ssize_t *consumed)
4765{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004766 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004767 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004768 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004769
4770 Py_ssize_t startinpos;
4771 Py_ssize_t endinpos;
4772 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004773 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004774 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004775 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004776
4777 if (size == 0) {
4778 if (consumed)
4779 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004780 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004781 }
4782
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004783 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4784 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004785 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004786 *consumed = 1;
4787 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004788 }
4789
Victor Stinner8f674cc2013-04-17 23:02:17 +02004790 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004791 writer.min_length = size;
4792 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004793 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004794
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004795 writer.pos = ascii_decode(s, end, writer.data);
4796 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004797 while (s < end) {
4798 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004799 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004800
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004801 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004802 if (PyUnicode_IS_ASCII(writer.buffer))
4803 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004804 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004805 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004806 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004807 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004808 } else {
4809 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004810 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004811 }
4812
4813 switch (ch) {
4814 case 0:
4815 if (s == end || consumed)
4816 goto End;
4817 errmsg = "unexpected end of data";
4818 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004819 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004820 break;
4821 case 1:
4822 errmsg = "invalid start byte";
4823 startinpos = s - starts;
4824 endinpos = startinpos + 1;
4825 break;
4826 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004827 case 3:
4828 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004829 errmsg = "invalid continuation byte";
4830 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004831 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004832 break;
4833 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004834 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004835 goto onError;
4836 continue;
4837 }
4838
Victor Stinner1d65d912015-10-05 13:43:50 +02004839 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004840 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004841
4842 switch (error_handler) {
4843 case _Py_ERROR_IGNORE:
4844 s += (endinpos - startinpos);
4845 break;
4846
4847 case _Py_ERROR_REPLACE:
4848 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4849 goto onError;
4850 s += (endinpos - startinpos);
4851 break;
4852
4853 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004854 {
4855 Py_ssize_t i;
4856
Victor Stinner1d65d912015-10-05 13:43:50 +02004857 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4858 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004859 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004860 ch = (Py_UCS4)(unsigned char)(starts[i]);
4861 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4862 ch + 0xdc00);
4863 writer.pos++;
4864 }
4865 s += (endinpos - startinpos);
4866 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004867 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004868
4869 default:
4870 if (unicode_decode_call_errorhandler_writer(
4871 errors, &error_handler_obj,
4872 "utf-8", errmsg,
4873 &starts, &end, &startinpos, &endinpos, &exc, &s,
4874 &writer))
4875 goto onError;
4876 }
Victor Stinner785938e2011-12-11 20:09:03 +01004877 }
4878
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004879End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004880 if (consumed)
4881 *consumed = s - starts;
4882
Victor Stinner1d65d912015-10-05 13:43:50 +02004883 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004884 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004885 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004886
4887onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004888 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004889 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004890 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004891 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004892}
4893
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004894
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004895/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4896 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004897
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004898 On success, write a pointer to a newly allocated wide character string into
4899 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4900 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004901
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004902 On memory allocation failure, return -1.
4903
4904 On decoding error (if surrogateescape is zero), return -2. If wlen is
4905 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4906 is not NULL, write the decoding error message into *reason. */
4907int
4908_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02004909 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004910{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004911 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004912 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004913 wchar_t *unicode;
4914 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004915
Victor Stinner3d4226a2018-08-29 22:21:32 +02004916 int surrogateescape = 0;
4917 int surrogatepass = 0;
4918 switch (errors)
4919 {
4920 case _Py_ERROR_STRICT:
4921 break;
4922 case _Py_ERROR_SURROGATEESCAPE:
4923 surrogateescape = 1;
4924 break;
4925 case _Py_ERROR_SURROGATEPASS:
4926 surrogatepass = 1;
4927 break;
4928 default:
4929 return -3;
4930 }
4931
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004932 /* Note: size will always be longer than the resulting Unicode
4933 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01004934 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004935 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004936 }
4937
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004938 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01004939 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004940 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004941 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004942
4943 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004944 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004945 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004946 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004947 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004948#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004949 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004950#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004951 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004952#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004953 if (ch > 0xFF) {
4954#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07004955 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004956#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02004957 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004958 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004959 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4960 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4961#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004962 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004963 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02004964 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004965 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004966 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02004967
4968 if (surrogateescape) {
4969 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4970 }
4971 else {
4972 /* Is it a valid three-byte code? */
4973 if (surrogatepass
4974 && (e - s) >= 3
4975 && (s[0] & 0xf0) == 0xe0
4976 && (s[1] & 0xc0) == 0x80
4977 && (s[2] & 0xc0) == 0x80)
4978 {
4979 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4980 s += 3;
4981 unicode[outpos++] = ch;
4982 }
4983 else {
4984 PyMem_RawFree(unicode );
4985 if (reason != NULL) {
4986 switch (ch) {
4987 case 0:
4988 *reason = "unexpected end of data";
4989 break;
4990 case 1:
4991 *reason = "invalid start byte";
4992 break;
4993 /* 2, 3, 4 */
4994 default:
4995 *reason = "invalid continuation byte";
4996 break;
4997 }
4998 }
4999 if (wlen != NULL) {
5000 *wlen = s - orig_s;
5001 }
5002 return -2;
5003 }
5004 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005006 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005007 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005008 if (wlen) {
5009 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005010 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005011 *wstr = unicode;
5012 return 0;
5013}
5014
5015wchar_t*
5016_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5017{
5018 wchar_t *wstr;
5019 int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5020 if (res != 0) {
5021 return NULL;
5022 }
5023 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005024}
5025
Antoine Pitrouab868312009-01-10 15:40:25 +00005026
Victor Stinnere47e6982017-12-21 15:45:16 +01005027/* UTF-8 encoder using the surrogateescape error handler .
5028
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005029 On success, return 0 and write the newly allocated character string (use
5030 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005031
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005032 On encoding failure, return -2 and write the position of the invalid
5033 surrogate character into *error_pos (if error_pos is set) and the decoding
5034 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005035
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005036 On memory allocation failure, return -1. */
5037int
5038_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005039 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005040{
5041 const Py_ssize_t max_char_size = 4;
5042 Py_ssize_t len = wcslen(text);
5043
5044 assert(len >= 0);
5045
Victor Stinner3d4226a2018-08-29 22:21:32 +02005046 int surrogateescape = 0;
5047 int surrogatepass = 0;
5048 switch (errors)
5049 {
5050 case _Py_ERROR_STRICT:
5051 break;
5052 case _Py_ERROR_SURROGATEESCAPE:
5053 surrogateescape = 1;
5054 break;
5055 case _Py_ERROR_SURROGATEPASS:
5056 surrogatepass = 1;
5057 break;
5058 default:
5059 return -3;
5060 }
5061
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005062 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5063 return -1;
5064 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005065 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005066 if (raw_malloc) {
5067 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005068 }
5069 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005070 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005071 }
5072 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005073 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005074 }
5075
5076 char *p = bytes;
5077 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005078 for (i = 0; i < len; ) {
5079 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005080 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005081 i++;
5082#if Py_UNICODE_SIZE == 2
5083 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5084 && i < len
5085 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5086 {
5087 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5088 i++;
5089 }
5090#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005091
5092 if (ch < 0x80) {
5093 /* Encode ASCII */
5094 *p++ = (char) ch;
5095
5096 }
5097 else if (ch < 0x0800) {
5098 /* Encode Latin-1 */
5099 *p++ = (char)(0xc0 | (ch >> 6));
5100 *p++ = (char)(0x80 | (ch & 0x3f));
5101 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005102 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005103 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005104 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005105 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005106 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005107 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005108 if (reason != NULL) {
5109 *reason = "encoding error";
5110 }
5111 if (raw_malloc) {
5112 PyMem_RawFree(bytes);
5113 }
5114 else {
5115 PyMem_Free(bytes);
5116 }
5117 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005118 }
5119 *p++ = (char)(ch & 0xff);
5120 }
5121 else if (ch < 0x10000) {
5122 *p++ = (char)(0xe0 | (ch >> 12));
5123 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5124 *p++ = (char)(0x80 | (ch & 0x3f));
5125 }
5126 else { /* ch >= 0x10000 */
5127 assert(ch <= MAX_UNICODE);
5128 /* Encode UCS4 Unicode ordinals */
5129 *p++ = (char)(0xf0 | (ch >> 18));
5130 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5131 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5132 *p++ = (char)(0x80 | (ch & 0x3f));
5133 }
5134 }
5135 *p++ = '\0';
5136
5137 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005138 char *bytes2;
5139 if (raw_malloc) {
5140 bytes2 = PyMem_RawRealloc(bytes, final_size);
5141 }
5142 else {
5143 bytes2 = PyMem_Realloc(bytes, final_size);
5144 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005145 if (bytes2 == NULL) {
5146 if (error_pos != NULL) {
5147 *error_pos = (size_t)-1;
5148 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005149 if (raw_malloc) {
5150 PyMem_RawFree(bytes);
5151 }
5152 else {
5153 PyMem_Free(bytes);
5154 }
5155 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005156 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005157 *str = bytes2;
5158 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005159}
5160
5161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005162/* Primary internal function which creates utf8 encoded bytes objects.
5163
5164 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005165 and allocate exactly as much space needed at the end. Else allocate the
5166 maximum possible needed (4 result bytes per Unicode character), and return
5167 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005168*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005169PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005170_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171{
Victor Stinner6099a032011-12-18 14:22:26 +01005172 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005173 void *data;
5174 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005176 if (!PyUnicode_Check(unicode)) {
5177 PyErr_BadArgument();
5178 return NULL;
5179 }
5180
5181 if (PyUnicode_READY(unicode) == -1)
5182 return NULL;
5183
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005184 if (PyUnicode_UTF8(unicode))
5185 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5186 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005187
5188 kind = PyUnicode_KIND(unicode);
5189 data = PyUnicode_DATA(unicode);
5190 size = PyUnicode_GET_LENGTH(unicode);
5191
Benjamin Petersonead6b532011-12-20 17:23:42 -06005192 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005193 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005194 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005195 case PyUnicode_1BYTE_KIND:
5196 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5197 assert(!PyUnicode_IS_ASCII(unicode));
5198 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5199 case PyUnicode_2BYTE_KIND:
5200 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5201 case PyUnicode_4BYTE_KIND:
5202 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204}
5205
Alexander Belopolsky40018472011-02-26 01:02:56 +00005206PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005207PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5208 Py_ssize_t size,
5209 const char *errors)
5210{
5211 PyObject *v, *unicode;
5212
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005213 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005214 if (unicode == NULL)
5215 return NULL;
5216 v = _PyUnicode_AsUTF8String(unicode, errors);
5217 Py_DECREF(unicode);
5218 return v;
5219}
5220
5221PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005222PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005224 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225}
5226
Walter Dörwald41980ca2007-08-16 21:55:45 +00005227/* --- UTF-32 Codec ------------------------------------------------------- */
5228
5229PyObject *
5230PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 Py_ssize_t size,
5232 const char *errors,
5233 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005234{
5235 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5236}
5237
5238PyObject *
5239PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 Py_ssize_t size,
5241 const char *errors,
5242 int *byteorder,
5243 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005244{
5245 const char *starts = s;
5246 Py_ssize_t startinpos;
5247 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005248 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005249 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005250 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005251 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005252 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005253 PyObject *errorHandler = NULL;
5254 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005255
Walter Dörwald41980ca2007-08-16 21:55:45 +00005256 q = (unsigned char *)s;
5257 e = q + size;
5258
5259 if (byteorder)
5260 bo = *byteorder;
5261
5262 /* Check for BOM marks (U+FEFF) in the input and adjust current
5263 byte order setting accordingly. In native mode, the leading BOM
5264 mark is skipped, in all other modes, it is copied to the output
5265 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005266 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005267 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005268 if (bom == 0x0000FEFF) {
5269 bo = -1;
5270 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005271 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005272 else if (bom == 0xFFFE0000) {
5273 bo = 1;
5274 q += 4;
5275 }
5276 if (byteorder)
5277 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005278 }
5279
Victor Stinnere64322e2012-10-30 23:12:47 +01005280 if (q == e) {
5281 if (consumed)
5282 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005283 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005284 }
5285
Victor Stinnere64322e2012-10-30 23:12:47 +01005286#ifdef WORDS_BIGENDIAN
5287 le = bo < 0;
5288#else
5289 le = bo <= 0;
5290#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005291 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005292
Victor Stinner8f674cc2013-04-17 23:02:17 +02005293 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005294 writer.min_length = (e - q + 3) / 4;
5295 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005296 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005297
Victor Stinnere64322e2012-10-30 23:12:47 +01005298 while (1) {
5299 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005300 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005301
Victor Stinnere64322e2012-10-30 23:12:47 +01005302 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005303 enum PyUnicode_Kind kind = writer.kind;
5304 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005305 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005306 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005307 if (le) {
5308 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005309 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005310 if (ch > maxch)
5311 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005312 if (kind != PyUnicode_1BYTE_KIND &&
5313 Py_UNICODE_IS_SURROGATE(ch))
5314 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005315 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005316 q += 4;
5317 } while (q <= last);
5318 }
5319 else {
5320 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005321 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005322 if (ch > maxch)
5323 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005324 if (kind != PyUnicode_1BYTE_KIND &&
5325 Py_UNICODE_IS_SURROGATE(ch))
5326 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005327 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005328 q += 4;
5329 } while (q <= last);
5330 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005331 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005332 }
5333
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005334 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005335 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005336 startinpos = ((const char *)q) - starts;
5337 endinpos = startinpos + 4;
5338 }
5339 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005340 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005341 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005342 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005343 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005344 startinpos = ((const char *)q) - starts;
5345 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005347 else {
5348 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005349 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005350 goto onError;
5351 q += 4;
5352 continue;
5353 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005354 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005355 startinpos = ((const char *)q) - starts;
5356 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005358
5359 /* The remaining input chars are ignored if the callback
5360 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005361 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005363 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005365 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005367 }
5368
Walter Dörwald41980ca2007-08-16 21:55:45 +00005369 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005371
Walter Dörwald41980ca2007-08-16 21:55:45 +00005372 Py_XDECREF(errorHandler);
5373 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005374 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005375
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005377 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005378 Py_XDECREF(errorHandler);
5379 Py_XDECREF(exc);
5380 return NULL;
5381}
5382
5383PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005384_PyUnicode_EncodeUTF32(PyObject *str,
5385 const char *errors,
5386 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005387{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005388 enum PyUnicode_Kind kind;
5389 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005390 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005391 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005392 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005393#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005394 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005395#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005396 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005397#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005398 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005399 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005400 PyObject *errorHandler = NULL;
5401 PyObject *exc = NULL;
5402 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005403
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005404 if (!PyUnicode_Check(str)) {
5405 PyErr_BadArgument();
5406 return NULL;
5407 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005408 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005409 return NULL;
5410 kind = PyUnicode_KIND(str);
5411 data = PyUnicode_DATA(str);
5412 len = PyUnicode_GET_LENGTH(str);
5413
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005414 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005415 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005416 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005417 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005418 if (v == NULL)
5419 return NULL;
5420
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005421 /* output buffer is 4-bytes aligned */
5422 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005423 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005424 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005425 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005426 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005427 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005428
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005429 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005430 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005431 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005432 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005433 else
5434 encoding = "utf-32";
5435
5436 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005437 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5438 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005439 }
5440
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005441 pos = 0;
5442 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005443 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005444
5445 if (kind == PyUnicode_2BYTE_KIND) {
5446 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5447 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005448 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005449 else {
5450 assert(kind == PyUnicode_4BYTE_KIND);
5451 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5452 &out, native_ordering);
5453 }
5454 if (pos == len)
5455 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005456
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005457 rep = unicode_encode_call_errorhandler(
5458 errors, &errorHandler,
5459 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005460 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005461 if (!rep)
5462 goto error;
5463
5464 if (PyBytes_Check(rep)) {
5465 repsize = PyBytes_GET_SIZE(rep);
5466 if (repsize & 3) {
5467 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005468 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005469 "surrogates not allowed");
5470 goto error;
5471 }
5472 moreunits = repsize / 4;
5473 }
5474 else {
5475 assert(PyUnicode_Check(rep));
5476 if (PyUnicode_READY(rep) < 0)
5477 goto error;
5478 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5479 if (!PyUnicode_IS_ASCII(rep)) {
5480 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005481 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005482 "surrogates not allowed");
5483 goto error;
5484 }
5485 }
5486
5487 /* four bytes are reserved for each surrogate */
5488 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005489 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005490 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005491 /* integer overflow */
5492 PyErr_NoMemory();
5493 goto error;
5494 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005495 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005496 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005497 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005498 }
5499
5500 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005501 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005502 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005504 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005505 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5506 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005507 }
5508
5509 Py_CLEAR(rep);
5510 }
5511
5512 /* Cut back to size actually needed. This is necessary for, for example,
5513 encoding of a string containing isolated surrogates and the 'ignore'
5514 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005515 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005516 if (nsize != PyBytes_GET_SIZE(v))
5517 _PyBytes_Resize(&v, nsize);
5518 Py_XDECREF(errorHandler);
5519 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005520 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005521 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005522 error:
5523 Py_XDECREF(rep);
5524 Py_XDECREF(errorHandler);
5525 Py_XDECREF(exc);
5526 Py_XDECREF(v);
5527 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005528}
5529
Alexander Belopolsky40018472011-02-26 01:02:56 +00005530PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005531PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5532 Py_ssize_t size,
5533 const char *errors,
5534 int byteorder)
5535{
5536 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005537 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005538 if (tmp == NULL)
5539 return NULL;
5540 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5541 Py_DECREF(tmp);
5542 return result;
5543}
5544
5545PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005546PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005547{
Victor Stinnerb960b342011-11-20 19:12:52 +01005548 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005549}
5550
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551/* --- UTF-16 Codec ------------------------------------------------------- */
5552
Tim Peters772747b2001-08-09 22:21:55 +00005553PyObject *
5554PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005555 Py_ssize_t size,
5556 const char *errors,
5557 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558{
Walter Dörwald69652032004-09-07 20:24:22 +00005559 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5560}
5561
5562PyObject *
5563PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 Py_ssize_t size,
5565 const char *errors,
5566 int *byteorder,
5567 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005568{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005569 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005570 Py_ssize_t startinpos;
5571 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005572 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005573 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005574 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005575 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005576 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005577 PyObject *errorHandler = NULL;
5578 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005579 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580
Tim Peters772747b2001-08-09 22:21:55 +00005581 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005582 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583
5584 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005585 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005587 /* Check for BOM marks (U+FEFF) in the input and adjust current
5588 byte order setting accordingly. In native mode, the leading BOM
5589 mark is skipped, in all other modes, it is copied to the output
5590 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005591 if (bo == 0 && size >= 2) {
5592 const Py_UCS4 bom = (q[1] << 8) | q[0];
5593 if (bom == 0xFEFF) {
5594 q += 2;
5595 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005596 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005597 else if (bom == 0xFFFE) {
5598 q += 2;
5599 bo = 1;
5600 }
5601 if (byteorder)
5602 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005603 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604
Antoine Pitrou63065d72012-05-15 23:48:04 +02005605 if (q == e) {
5606 if (consumed)
5607 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005608 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005609 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005610
Christian Heimes743e0cd2012-10-17 23:52:17 +02005611#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005612 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005613 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005614#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005615 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005616 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005617#endif
Tim Peters772747b2001-08-09 22:21:55 +00005618
Antoine Pitrou63065d72012-05-15 23:48:04 +02005619 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005620 character count normally. Error handler will take care of
5621 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005622 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005623 writer.min_length = (e - q + 1) / 2;
5624 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005625 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005626
Antoine Pitrou63065d72012-05-15 23:48:04 +02005627 while (1) {
5628 Py_UCS4 ch = 0;
5629 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005630 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005631 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005632 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005633 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005634 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005635 native_ordering);
5636 else
5637 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005638 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005639 native_ordering);
5640 } else if (kind == PyUnicode_2BYTE_KIND) {
5641 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005642 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005643 native_ordering);
5644 } else {
5645 assert(kind == PyUnicode_4BYTE_KIND);
5646 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005647 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005648 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005649 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005650 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005651
Antoine Pitrou63065d72012-05-15 23:48:04 +02005652 switch (ch)
5653 {
5654 case 0:
5655 /* remaining byte at the end? (size should be even) */
5656 if (q == e || consumed)
5657 goto End;
5658 errmsg = "truncated data";
5659 startinpos = ((const char *)q) - starts;
5660 endinpos = ((const char *)e) - starts;
5661 break;
5662 /* The remaining input chars are ignored if the callback
5663 chooses to skip the input */
5664 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005665 q -= 2;
5666 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005667 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005668 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005669 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005670 endinpos = ((const char *)e) - starts;
5671 break;
5672 case 2:
5673 errmsg = "illegal encoding";
5674 startinpos = ((const char *)q) - 2 - starts;
5675 endinpos = startinpos + 2;
5676 break;
5677 case 3:
5678 errmsg = "illegal UTF-16 surrogate";
5679 startinpos = ((const char *)q) - 4 - starts;
5680 endinpos = startinpos + 2;
5681 break;
5682 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005683 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005684 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 continue;
5686 }
5687
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005688 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005689 errors,
5690 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005691 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005692 &starts,
5693 (const char **)&e,
5694 &startinpos,
5695 &endinpos,
5696 &exc,
5697 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005698 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 }
5701
Antoine Pitrou63065d72012-05-15 23:48:04 +02005702End:
Walter Dörwald69652032004-09-07 20:24:22 +00005703 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005704 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005705
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005706 Py_XDECREF(errorHandler);
5707 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005708 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709
Benjamin Peterson29060642009-01-31 22:14:21 +00005710 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005711 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712 Py_XDECREF(errorHandler);
5713 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 return NULL;
5715}
5716
Tim Peters772747b2001-08-09 22:21:55 +00005717PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005718_PyUnicode_EncodeUTF16(PyObject *str,
5719 const char *errors,
5720 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005722 enum PyUnicode_Kind kind;
5723 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005724 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005725 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005726 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005727 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005728#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005729 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005730#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005731 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005732#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005733 const char *encoding;
5734 Py_ssize_t nsize, pos;
5735 PyObject *errorHandler = NULL;
5736 PyObject *exc = NULL;
5737 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005738
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005739 if (!PyUnicode_Check(str)) {
5740 PyErr_BadArgument();
5741 return NULL;
5742 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005743 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005744 return NULL;
5745 kind = PyUnicode_KIND(str);
5746 data = PyUnicode_DATA(str);
5747 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005748
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005749 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005750 if (kind == PyUnicode_4BYTE_KIND) {
5751 const Py_UCS4 *in = (const Py_UCS4 *)data;
5752 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005753 while (in < end) {
5754 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005755 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005756 }
5757 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005758 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005759 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005761 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005762 nsize = len + pairs + (byteorder == 0);
5763 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005764 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005766 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005768 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005769 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005770 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005771 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005772 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005773 }
5774 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005775 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005776 }
Tim Peters772747b2001-08-09 22:21:55 +00005777
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005778 if (kind == PyUnicode_1BYTE_KIND) {
5779 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5780 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005781 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005782
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005783 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005784 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005785 }
5786 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005787 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005788 }
5789 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005790 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005791 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005792
5793 pos = 0;
5794 while (pos < len) {
5795 Py_ssize_t repsize, moreunits;
5796
5797 if (kind == PyUnicode_2BYTE_KIND) {
5798 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5799 &out, native_ordering);
5800 }
5801 else {
5802 assert(kind == PyUnicode_4BYTE_KIND);
5803 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5804 &out, native_ordering);
5805 }
5806 if (pos == len)
5807 break;
5808
5809 rep = unicode_encode_call_errorhandler(
5810 errors, &errorHandler,
5811 encoding, "surrogates not allowed",
5812 str, &exc, pos, pos + 1, &pos);
5813 if (!rep)
5814 goto error;
5815
5816 if (PyBytes_Check(rep)) {
5817 repsize = PyBytes_GET_SIZE(rep);
5818 if (repsize & 1) {
5819 raise_encode_exception(&exc, encoding,
5820 str, pos - 1, pos,
5821 "surrogates not allowed");
5822 goto error;
5823 }
5824 moreunits = repsize / 2;
5825 }
5826 else {
5827 assert(PyUnicode_Check(rep));
5828 if (PyUnicode_READY(rep) < 0)
5829 goto error;
5830 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5831 if (!PyUnicode_IS_ASCII(rep)) {
5832 raise_encode_exception(&exc, encoding,
5833 str, pos - 1, pos,
5834 "surrogates not allowed");
5835 goto error;
5836 }
5837 }
5838
5839 /* two bytes are reserved for each surrogate */
5840 if (moreunits > 1) {
5841 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005842 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005843 /* integer overflow */
5844 PyErr_NoMemory();
5845 goto error;
5846 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005847 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005848 goto error;
5849 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5850 }
5851
5852 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005853 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005854 out += moreunits;
5855 } else /* rep is unicode */ {
5856 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5857 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5858 &out, native_ordering);
5859 }
5860
5861 Py_CLEAR(rep);
5862 }
5863
5864 /* Cut back to size actually needed. This is necessary for, for example,
5865 encoding of a string containing isolated surrogates and the 'ignore' handler
5866 is used. */
5867 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5868 if (nsize != PyBytes_GET_SIZE(v))
5869 _PyBytes_Resize(&v, nsize);
5870 Py_XDECREF(errorHandler);
5871 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005872 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005873 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005874 error:
5875 Py_XDECREF(rep);
5876 Py_XDECREF(errorHandler);
5877 Py_XDECREF(exc);
5878 Py_XDECREF(v);
5879 return NULL;
5880#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881}
5882
Alexander Belopolsky40018472011-02-26 01:02:56 +00005883PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005884PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5885 Py_ssize_t size,
5886 const char *errors,
5887 int byteorder)
5888{
5889 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005890 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005891 if (tmp == NULL)
5892 return NULL;
5893 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5894 Py_DECREF(tmp);
5895 return result;
5896}
5897
5898PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005899PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005901 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902}
5903
5904/* --- Unicode Escape Codec ----------------------------------------------- */
5905
Fredrik Lundh06d12682001-01-24 07:59:11 +00005906static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005907
Alexander Belopolsky40018472011-02-26 01:02:56 +00005908PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005909_PyUnicode_DecodeUnicodeEscape(const char *s,
5910 Py_ssize_t size,
5911 const char *errors,
5912 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005914 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005915 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005917 PyObject *errorHandler = NULL;
5918 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005919
Eric V. Smith42454af2016-10-31 09:22:08 -04005920 // so we can remember if we've seen an invalid escape char or not
5921 *first_invalid_escape = NULL;
5922
Victor Stinner62ec3312016-09-06 17:04:34 -07005923 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005924 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005925 }
5926 /* Escaped strings will always be longer than the resulting
5927 Unicode string, so we start with size here and then reduce the
5928 length after conversion to the true value.
5929 (but if the error callback returns a long replacement string
5930 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005931 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005932 writer.min_length = size;
5933 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5934 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005935 }
5936
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 end = s + size;
5938 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005939 unsigned char c = (unsigned char) *s++;
5940 Py_UCS4 ch;
5941 int count;
5942 Py_ssize_t startinpos;
5943 Py_ssize_t endinpos;
5944 const char *message;
5945
5946#define WRITE_ASCII_CHAR(ch) \
5947 do { \
5948 assert(ch <= 127); \
5949 assert(writer.pos < writer.size); \
5950 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5951 } while(0)
5952
5953#define WRITE_CHAR(ch) \
5954 do { \
5955 if (ch <= writer.maxchar) { \
5956 assert(writer.pos < writer.size); \
5957 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5958 } \
5959 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5960 goto onError; \
5961 } \
5962 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963
5964 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005965 if (c != '\\') {
5966 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 continue;
5968 }
5969
Victor Stinner62ec3312016-09-06 17:04:34 -07005970 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005972 if (s >= end) {
5973 message = "\\ at end of string";
5974 goto error;
5975 }
5976 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005977
Victor Stinner62ec3312016-09-06 17:04:34 -07005978 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005979 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005982 case '\n': continue;
5983 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5984 case '\'': WRITE_ASCII_CHAR('\''); continue;
5985 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5986 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005987 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005988 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5989 case 't': WRITE_ASCII_CHAR('\t'); continue;
5990 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5991 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005992 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005993 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005994 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005995 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996
Benjamin Peterson29060642009-01-31 22:14:21 +00005997 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 case '0': case '1': case '2': case '3':
5999 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006000 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006001 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006002 ch = (ch<<3) + *s++ - '0';
6003 if (s < end && '0' <= *s && *s <= '7') {
6004 ch = (ch<<3) + *s++ - '0';
6005 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006007 WRITE_CHAR(ch);
6008 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 /* hex escapes */
6011 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006013 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006014 message = "truncated \\xXX escape";
6015 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006019 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006020 message = "truncated \\uXXXX escape";
6021 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006024 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006025 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006026 message = "truncated \\UXXXXXXXX escape";
6027 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006028 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006029 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006030 ch <<= 4;
6031 if (c >= '0' && c <= '9') {
6032 ch += c - '0';
6033 }
6034 else if (c >= 'a' && c <= 'f') {
6035 ch += c - ('a' - 10);
6036 }
6037 else if (c >= 'A' && c <= 'F') {
6038 ch += c - ('A' - 10);
6039 }
6040 else {
6041 break;
6042 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006043 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006044 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006045 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006046 }
6047
6048 /* when we get here, ch is a 32-bit unicode character */
6049 if (ch > MAX_UNICODE) {
6050 message = "illegal Unicode character";
6051 goto error;
6052 }
6053
6054 WRITE_CHAR(ch);
6055 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006056
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006058 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006059 if (ucnhash_CAPI == NULL) {
6060 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006061 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6062 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006063 if (ucnhash_CAPI == NULL) {
6064 PyErr_SetString(
6065 PyExc_UnicodeError,
6066 "\\N escapes not supported (can't load unicodedata module)"
6067 );
6068 goto onError;
6069 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006070 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006071
6072 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006073 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006074 const char *start = ++s;
6075 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006076 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006077 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006078 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006079 namelen = s - start;
6080 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006081 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006082 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006083 ch = 0xffffffff; /* in case 'getcode' messes up */
6084 if (namelen <= INT_MAX &&
6085 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6086 &ch, 0)) {
6087 assert(ch <= MAX_UNICODE);
6088 WRITE_CHAR(ch);
6089 continue;
6090 }
6091 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006092 }
6093 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006094 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006095
6096 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006097 if (*first_invalid_escape == NULL) {
6098 *first_invalid_escape = s-1; /* Back up one char, since we've
6099 already incremented s. */
6100 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006101 WRITE_ASCII_CHAR('\\');
6102 WRITE_CHAR(c);
6103 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006105
6106 error:
6107 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006108 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006109 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006110 errors, &errorHandler,
6111 "unicodeescape", message,
6112 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006113 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006114 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006115 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006116 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006117
6118#undef WRITE_ASCII_CHAR
6119#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006121
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006122 Py_XDECREF(errorHandler);
6123 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006124 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006125
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006127 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006128 Py_XDECREF(errorHandler);
6129 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 return NULL;
6131}
6132
Eric V. Smith42454af2016-10-31 09:22:08 -04006133PyObject *
6134PyUnicode_DecodeUnicodeEscape(const char *s,
6135 Py_ssize_t size,
6136 const char *errors)
6137{
6138 const char *first_invalid_escape;
6139 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6140 &first_invalid_escape);
6141 if (result == NULL)
6142 return NULL;
6143 if (first_invalid_escape != NULL) {
6144 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6145 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006146 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006147 Py_DECREF(result);
6148 return NULL;
6149 }
6150 }
6151 return result;
6152}
6153
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006154/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155
Alexander Belopolsky40018472011-02-26 01:02:56 +00006156PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006157PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006160 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006162 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006163 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006164 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165
Ezio Melottie7f90372012-10-05 03:33:31 +03006166 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006167 escape.
6168
Ezio Melottie7f90372012-10-05 03:33:31 +03006169 For UCS1 strings it's '\xxx', 4 bytes per source character.
6170 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6171 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006172 */
6173
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006174 if (!PyUnicode_Check(unicode)) {
6175 PyErr_BadArgument();
6176 return NULL;
6177 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006178 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006179 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006180 }
Victor Stinner358af132015-10-12 22:36:57 +02006181
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006182 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006183 if (len == 0) {
6184 return PyBytes_FromStringAndSize(NULL, 0);
6185 }
6186
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006187 kind = PyUnicode_KIND(unicode);
6188 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006189 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6190 bytes, and 1 byte characters 4. */
6191 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006192 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006193 return PyErr_NoMemory();
6194 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006195 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006196 if (repr == NULL) {
6197 return NULL;
6198 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006199
Victor Stinner62ec3312016-09-06 17:04:34 -07006200 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006201 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006202 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006203
Victor Stinner62ec3312016-09-06 17:04:34 -07006204 /* U+0000-U+00ff range */
6205 if (ch < 0x100) {
6206 if (ch >= ' ' && ch < 127) {
6207 if (ch != '\\') {
6208 /* Copy printable US ASCII as-is */
6209 *p++ = (char) ch;
6210 }
6211 /* Escape backslashes */
6212 else {
6213 *p++ = '\\';
6214 *p++ = '\\';
6215 }
6216 }
Victor Stinner358af132015-10-12 22:36:57 +02006217
Victor Stinner62ec3312016-09-06 17:04:34 -07006218 /* Map special whitespace to '\t', \n', '\r' */
6219 else if (ch == '\t') {
6220 *p++ = '\\';
6221 *p++ = 't';
6222 }
6223 else if (ch == '\n') {
6224 *p++ = '\\';
6225 *p++ = 'n';
6226 }
6227 else if (ch == '\r') {
6228 *p++ = '\\';
6229 *p++ = 'r';
6230 }
6231
6232 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6233 else {
6234 *p++ = '\\';
6235 *p++ = 'x';
6236 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6237 *p++ = Py_hexdigits[ch & 0x000F];
6238 }
Tim Petersced69f82003-09-16 20:30:58 +00006239 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006240 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006241 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242 *p++ = '\\';
6243 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006244 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6245 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6246 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6247 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006249 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6250 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006251
Victor Stinner62ec3312016-09-06 17:04:34 -07006252 /* Make sure that the first two digits are zero */
6253 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006254 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006255 *p++ = 'U';
6256 *p++ = '0';
6257 *p++ = '0';
6258 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6259 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6260 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6261 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6262 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6263 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006264 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266
Victor Stinner62ec3312016-09-06 17:04:34 -07006267 assert(p - PyBytes_AS_STRING(repr) > 0);
6268 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6269 return NULL;
6270 }
6271 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272}
6273
Alexander Belopolsky40018472011-02-26 01:02:56 +00006274PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006275PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6276 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006278 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006279 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006280 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006282 }
6283
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006284 result = PyUnicode_AsUnicodeEscapeString(tmp);
6285 Py_DECREF(tmp);
6286 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287}
6288
6289/* --- Raw Unicode Escape Codec ------------------------------------------- */
6290
Alexander Belopolsky40018472011-02-26 01:02:56 +00006291PyObject *
6292PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006293 Py_ssize_t size,
6294 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006296 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006297 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006299 PyObject *errorHandler = NULL;
6300 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006301
Victor Stinner62ec3312016-09-06 17:04:34 -07006302 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006303 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006304 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006305
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 /* Escaped strings will always be longer than the resulting
6307 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006308 length after conversion to the true value. (But decoding error
6309 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006310 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 writer.min_length = size;
6312 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6313 goto onError;
6314 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006315
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 end = s + size;
6317 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006318 unsigned char c = (unsigned char) *s++;
6319 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006320 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006321 Py_ssize_t startinpos;
6322 Py_ssize_t endinpos;
6323 const char *message;
6324
6325#define WRITE_CHAR(ch) \
6326 do { \
6327 if (ch <= writer.maxchar) { \
6328 assert(writer.pos < writer.size); \
6329 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6330 } \
6331 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6332 goto onError; \
6333 } \
6334 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335
Benjamin Peterson29060642009-01-31 22:14:21 +00006336 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006337 if (c != '\\' || s >= end) {
6338 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006340 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006341
Victor Stinner62ec3312016-09-06 17:04:34 -07006342 c = (unsigned char) *s++;
6343 if (c == 'u') {
6344 count = 4;
6345 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006347 else if (c == 'U') {
6348 count = 8;
6349 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006350 }
6351 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006352 assert(writer.pos < writer.size);
6353 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6354 WRITE_CHAR(c);
6355 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006356 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006357 startinpos = s - starts - 2;
6358
6359 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6360 for (ch = 0; count && s < end; ++s, --count) {
6361 c = (unsigned char)*s;
6362 ch <<= 4;
6363 if (c >= '0' && c <= '9') {
6364 ch += c - '0';
6365 }
6366 else if (c >= 'a' && c <= 'f') {
6367 ch += c - ('a' - 10);
6368 }
6369 else if (c >= 'A' && c <= 'F') {
6370 ch += c - ('A' - 10);
6371 }
6372 else {
6373 break;
6374 }
6375 }
6376 if (!count) {
6377 if (ch <= MAX_UNICODE) {
6378 WRITE_CHAR(ch);
6379 continue;
6380 }
6381 message = "\\Uxxxxxxxx out of range";
6382 }
6383
6384 endinpos = s-starts;
6385 writer.min_length = end - s + writer.pos;
6386 if (unicode_decode_call_errorhandler_writer(
6387 errors, &errorHandler,
6388 "rawunicodeescape", message,
6389 &starts, &end, &startinpos, &endinpos, &exc, &s,
6390 &writer)) {
6391 goto onError;
6392 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006393 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006394
6395#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006397 Py_XDECREF(errorHandler);
6398 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006399 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006400
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006402 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006403 Py_XDECREF(errorHandler);
6404 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006406
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407}
6408
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006409
Alexander Belopolsky40018472011-02-26 01:02:56 +00006410PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412{
Victor Stinner62ec3312016-09-06 17:04:34 -07006413 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006415 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006416 int kind;
6417 void *data;
6418 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006420 if (!PyUnicode_Check(unicode)) {
6421 PyErr_BadArgument();
6422 return NULL;
6423 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006424 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006425 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006426 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006427 kind = PyUnicode_KIND(unicode);
6428 data = PyUnicode_DATA(unicode);
6429 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006430 if (kind == PyUnicode_1BYTE_KIND) {
6431 return PyBytes_FromStringAndSize(data, len);
6432 }
Victor Stinner0e368262011-11-10 20:12:49 +01006433
Victor Stinner62ec3312016-09-06 17:04:34 -07006434 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6435 bytes, and 1 byte characters 4. */
6436 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006437
Victor Stinner62ec3312016-09-06 17:04:34 -07006438 if (len > PY_SSIZE_T_MAX / expandsize) {
6439 return PyErr_NoMemory();
6440 }
6441 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6442 if (repr == NULL) {
6443 return NULL;
6444 }
6445 if (len == 0) {
6446 return repr;
6447 }
6448
6449 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006450 for (pos = 0; pos < len; pos++) {
6451 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006452
Victor Stinner62ec3312016-09-06 17:04:34 -07006453 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6454 if (ch < 0x100) {
6455 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006456 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006457 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006458 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 *p++ = '\\';
6460 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006461 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6462 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6463 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6464 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006466 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6467 else {
6468 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6469 *p++ = '\\';
6470 *p++ = 'U';
6471 *p++ = '0';
6472 *p++ = '0';
6473 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6474 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6475 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6476 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6477 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6478 *p++ = Py_hexdigits[ch & 15];
6479 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006481
Victor Stinner62ec3312016-09-06 17:04:34 -07006482 assert(p > PyBytes_AS_STRING(repr));
6483 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6484 return NULL;
6485 }
6486 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487}
6488
Alexander Belopolsky40018472011-02-26 01:02:56 +00006489PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006490PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6491 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006493 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006494 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006495 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006496 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006497 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6498 Py_DECREF(tmp);
6499 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500}
6501
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006502/* --- Unicode Internal Codec ------------------------------------------- */
6503
Alexander Belopolsky40018472011-02-26 01:02:56 +00006504PyObject *
6505_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006506 Py_ssize_t size,
6507 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006508{
6509 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006510 Py_ssize_t startinpos;
6511 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006512 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006513 const char *end;
6514 const char *reason;
6515 PyObject *errorHandler = NULL;
6516 PyObject *exc = NULL;
6517
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006518 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006519 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006520 1))
6521 return NULL;
6522
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006523 if (size < 0) {
6524 PyErr_BadInternalCall();
6525 return NULL;
6526 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006527 if (size == 0)
6528 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006529
Victor Stinner8f674cc2013-04-17 23:02:17 +02006530 _PyUnicodeWriter_Init(&writer);
6531 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6532 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006534 }
6535 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006536
Victor Stinner8f674cc2013-04-17 23:02:17 +02006537 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006538 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006539 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006540 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006541 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006542 endinpos = end-starts;
6543 reason = "truncated input";
6544 goto error;
6545 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006546 /* We copy the raw representation one byte at a time because the
6547 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006548 ((char *) &uch)[0] = s[0];
6549 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006550#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006551 ((char *) &uch)[2] = s[2];
6552 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006553#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006554 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006555#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006556 /* We have to sanity check the raw data, otherwise doom looms for
6557 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006558 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006559 endinpos = s - starts + Py_UNICODE_SIZE;
6560 reason = "illegal code point (> 0x10FFFF)";
6561 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006562 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006563#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006564 s += Py_UNICODE_SIZE;
6565#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006566 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006567 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006568 Py_UNICODE uch2;
6569 ((char *) &uch2)[0] = s[0];
6570 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006571 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006572 {
Victor Stinner551ac952011-11-29 22:58:13 +01006573 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006574 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006575 }
6576 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006577#endif
6578
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006579 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006580 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006581 continue;
6582
6583 error:
6584 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006585 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006586 errors, &errorHandler,
6587 "unicode_internal", reason,
6588 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006589 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006590 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006591 }
6592
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006593 Py_XDECREF(errorHandler);
6594 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006595 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006596
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006598 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006599 Py_XDECREF(errorHandler);
6600 Py_XDECREF(exc);
6601 return NULL;
6602}
6603
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604/* --- Latin-1 Codec ------------------------------------------------------ */
6605
Alexander Belopolsky40018472011-02-26 01:02:56 +00006606PyObject *
6607PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006608 Py_ssize_t size,
6609 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006612 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613}
6614
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006615/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006616static void
6617make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006618 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006619 PyObject *unicode,
6620 Py_ssize_t startpos, Py_ssize_t endpos,
6621 const char *reason)
6622{
6623 if (*exceptionObject == NULL) {
6624 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006625 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006626 encoding, unicode, startpos, endpos, reason);
6627 }
6628 else {
6629 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6630 goto onError;
6631 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6632 goto onError;
6633 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6634 goto onError;
6635 return;
6636 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006637 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006638 }
6639}
6640
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006642static void
6643raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006644 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006645 PyObject *unicode,
6646 Py_ssize_t startpos, Py_ssize_t endpos,
6647 const char *reason)
6648{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006649 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006650 encoding, unicode, startpos, endpos, reason);
6651 if (*exceptionObject != NULL)
6652 PyCodec_StrictErrors(*exceptionObject);
6653}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006654
6655/* error handling callback helper:
6656 build arguments, call the callback and check the arguments,
6657 put the result into newpos and return the replacement string, which
6658 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006659static PyObject *
6660unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006661 PyObject **errorHandler,
6662 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006663 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006664 Py_ssize_t startpos, Py_ssize_t endpos,
6665 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006666{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006667 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006668 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669 PyObject *restuple;
6670 PyObject *resunicode;
6671
6672 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006674 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006675 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676 }
6677
Benjamin Petersonbac79492012-01-14 13:34:47 -05006678 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006679 return NULL;
6680 len = PyUnicode_GET_LENGTH(unicode);
6681
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006682 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006683 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006686
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006687 restuple = PyObject_CallFunctionObjArgs(
6688 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006689 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006691 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006692 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 Py_DECREF(restuple);
6694 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006696 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006697 &resunicode, newpos)) {
6698 Py_DECREF(restuple);
6699 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006700 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006701 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6702 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6703 Py_DECREF(restuple);
6704 return NULL;
6705 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006706 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006707 *newpos = len + *newpos;
6708 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006709 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006710 Py_DECREF(restuple);
6711 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006712 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 Py_INCREF(resunicode);
6714 Py_DECREF(restuple);
6715 return resunicode;
6716}
6717
Alexander Belopolsky40018472011-02-26 01:02:56 +00006718static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006719unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006720 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006721 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006723 /* input state */
6724 Py_ssize_t pos=0, size;
6725 int kind;
6726 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006727 /* pointer into the output */
6728 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006729 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6730 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006731 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006733 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006734 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006735 /* output object */
6736 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006737
Benjamin Petersonbac79492012-01-14 13:34:47 -05006738 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006739 return NULL;
6740 size = PyUnicode_GET_LENGTH(unicode);
6741 kind = PyUnicode_KIND(unicode);
6742 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006743 /* allocate enough for a simple encoding without
6744 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006745 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006746 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006747
6748 _PyBytesWriter_Init(&writer);
6749 str = _PyBytesWriter_Alloc(&writer, size);
6750 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006751 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006752
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006753 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006754 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006755
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006757 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006758 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006759 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006760 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006761 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006763 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006764 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006765 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006766 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006768
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006769 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006771
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006772 /* Only overallocate the buffer if it's not the last write */
6773 writer.overallocate = (collend < size);
6774
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006776 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006777 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006778
6779 switch (error_handler) {
6780 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006781 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006783
6784 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006785 memset(str, '?', collend - collstart);
6786 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006787 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006788 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006789 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 break;
Victor Stinner50149202015-09-22 00:26:54 +02006791
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006792 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006793 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006794 writer.min_size -= (collend - collstart);
6795 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006796 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006797 if (str == NULL)
6798 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006799 pos = collend;
6800 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006801
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006802 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006803 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006804 writer.min_size -= (collend - collstart);
6805 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006806 unicode, collstart, collend);
6807 if (str == NULL)
6808 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006809 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 break;
Victor Stinner50149202015-09-22 00:26:54 +02006811
Victor Stinnerc3713e92015-09-29 12:32:13 +02006812 case _Py_ERROR_SURROGATEESCAPE:
6813 for (i = collstart; i < collend; ++i) {
6814 ch = PyUnicode_READ(kind, data, i);
6815 if (ch < 0xdc80 || 0xdcff < ch) {
6816 /* Not a UTF-8b surrogate */
6817 break;
6818 }
6819 *str++ = (char)(ch - 0xdc00);
6820 ++pos;
6821 }
6822 if (i >= collend)
6823 break;
6824 collstart = pos;
6825 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006826 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006827
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006829 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6830 encoding, reason, unicode, &exc,
6831 collstart, collend, &newpos);
6832 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006833 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006834
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006835 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006836 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006837
Victor Stinner6bd525b2015-10-09 13:10:05 +02006838 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006839 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006840 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006841 PyBytes_AS_STRING(rep),
6842 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006843 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006844 else {
6845 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006846
Victor Stinner6bd525b2015-10-09 13:10:05 +02006847 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006848 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006849
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006850 if (limit == 256 ?
6851 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6852 !PyUnicode_IS_ASCII(rep))
6853 {
6854 /* Not all characters are smaller than limit */
6855 raise_encode_exception(&exc, encoding, unicode,
6856 collstart, collend, reason);
6857 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006859 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6860 str = _PyBytesWriter_WriteBytes(&writer, str,
6861 PyUnicode_DATA(rep),
6862 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006864 if (str == NULL)
6865 goto onError;
6866
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006867 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006868 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006869 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006870
6871 /* If overallocation was disabled, ensure that it was the last
6872 write. Otherwise, we missed an optimization */
6873 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006874 }
6875 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006876
Victor Stinner50149202015-09-22 00:26:54 +02006877 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006878 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006879 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006880
6881 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006882 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006883 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006884 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006885 Py_XDECREF(exc);
6886 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006887}
6888
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006889/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006890PyObject *
6891PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006892 Py_ssize_t size,
6893 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006895 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006896 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006897 if (unicode == NULL)
6898 return NULL;
6899 result = unicode_encode_ucs1(unicode, errors, 256);
6900 Py_DECREF(unicode);
6901 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902}
6903
Alexander Belopolsky40018472011-02-26 01:02:56 +00006904PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006905_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906{
6907 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006908 PyErr_BadArgument();
6909 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006911 if (PyUnicode_READY(unicode) == -1)
6912 return NULL;
6913 /* Fast path: if it is a one-byte string, construct
6914 bytes object directly. */
6915 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6916 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6917 PyUnicode_GET_LENGTH(unicode));
6918 /* Non-Latin-1 characters present. Defer to above function to
6919 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006920 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006921}
6922
6923PyObject*
6924PyUnicode_AsLatin1String(PyObject *unicode)
6925{
6926 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927}
6928
6929/* --- 7-bit ASCII Codec -------------------------------------------------- */
6930
Alexander Belopolsky40018472011-02-26 01:02:56 +00006931PyObject *
6932PyUnicode_DecodeASCII(const char *s,
6933 Py_ssize_t size,
6934 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006936 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006937 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006938 int kind;
6939 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006940 Py_ssize_t startinpos;
6941 Py_ssize_t endinpos;
6942 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006943 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006944 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006945 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006946 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006947
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006949 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006950
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006952 if (size == 1 && (unsigned char)s[0] < 128)
6953 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006954
Victor Stinner8f674cc2013-04-17 23:02:17 +02006955 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006956 writer.min_length = size;
6957 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006958 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006959
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006960 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006961 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006962 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006963 writer.pos = outpos;
6964 if (writer.pos == size)
6965 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006966
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006967 s += writer.pos;
6968 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006969 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006970 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006972 PyUnicode_WRITE(kind, data, writer.pos, c);
6973 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006975 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006976 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006977
6978 /* byte outsize range 0x00..0x7f: call the error handler */
6979
6980 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006981 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006982
6983 switch (error_handler)
6984 {
6985 case _Py_ERROR_REPLACE:
6986 case _Py_ERROR_SURROGATEESCAPE:
6987 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006988 but we may switch to UCS2 at the first write */
6989 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6990 goto onError;
6991 kind = writer.kind;
6992 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006993
6994 if (error_handler == _Py_ERROR_REPLACE)
6995 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6996 else
6997 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6998 writer.pos++;
6999 ++s;
7000 break;
7001
7002 case _Py_ERROR_IGNORE:
7003 ++s;
7004 break;
7005
7006 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 startinpos = s-starts;
7008 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007009 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007010 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 "ascii", "ordinal not in range(128)",
7012 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007013 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007015 kind = writer.kind;
7016 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007019 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007020 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007021 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007022
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007024 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007025 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007026 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027 return NULL;
7028}
7029
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007030/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007031PyObject *
7032PyUnicode_EncodeASCII(const Py_UNICODE *p,
7033 Py_ssize_t size,
7034 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007036 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007037 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007038 if (unicode == NULL)
7039 return NULL;
7040 result = unicode_encode_ucs1(unicode, errors, 128);
7041 Py_DECREF(unicode);
7042 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043}
7044
Alexander Belopolsky40018472011-02-26 01:02:56 +00007045PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007046_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047{
7048 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007049 PyErr_BadArgument();
7050 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007052 if (PyUnicode_READY(unicode) == -1)
7053 return NULL;
7054 /* Fast path: if it is an ASCII-only string, construct bytes object
7055 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007056 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007057 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7058 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007059 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007060}
7061
7062PyObject *
7063PyUnicode_AsASCIIString(PyObject *unicode)
7064{
7065 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066}
7067
Steve Dowercc16be82016-09-08 10:35:16 -07007068#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007069
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007070/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007071
/* NEED_RETRY is defined when int is narrower than size_t.
   decode_code_page_stateful() then splits inputs longer than INT_MAX into
   several chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#  define NEED_RETRY
#endif

/* Provide WC_ERR_INVALID_CHARS when the platform headers do not. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7079
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007080static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007081code_page_name(UINT code_page, PyObject **obj)
7082{
7083 *obj = NULL;
7084 if (code_page == CP_ACP)
7085 return "mbcs";
7086 if (code_page == CP_UTF7)
7087 return "CP_UTF7";
7088 if (code_page == CP_UTF8)
7089 return "CP_UTF8";
7090
7091 *obj = PyBytes_FromFormat("cp%u", code_page);
7092 if (*obj == NULL)
7093 return NULL;
7094 return PyBytes_AS_STRING(*obj);
7095}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007096
Victor Stinner3a50e702011-10-18 21:21:00 +02007097static DWORD
7098decode_code_page_flags(UINT code_page)
7099{
7100 if (code_page == CP_UTF7) {
7101 /* The CP_UTF7 decoder only supports flags=0 */
7102 return 0;
7103 }
7104 else
7105 return MB_ERR_INVALID_CHARS;
7106}
7107
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007108/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007109 * Decode a byte string from a Windows code page into unicode object in strict
7110 * mode.
7111 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007112 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7113 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007114 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007115static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007116decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007117 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007118 const char *in,
7119 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007120{
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007122 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007123 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007124
7125 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007126 assert(insize > 0);
7127 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7128 if (outsize <= 0)
7129 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007130
7131 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007133 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007134 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 if (*v == NULL)
7136 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007137 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007138 }
7139 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007140 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007141 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007142 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007143 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007144 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007145 }
7146
7147 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007148 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7149 if (outsize <= 0)
7150 goto error;
7151 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007152
Victor Stinner3a50e702011-10-18 21:21:00 +02007153error:
7154 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7155 return -2;
7156 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007157 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007158}
7159
Victor Stinner3a50e702011-10-18 21:21:00 +02007160/*
7161 * Decode a byte string from a code page into unicode object with an error
7162 * handler.
7163 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007164 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 * UnicodeDecodeError exception and returns -1 on error.
7166 */
7167static int
7168decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007169 PyObject **v,
7170 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007171 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007172{
7173 const char *startin = in;
7174 const char *endin = in + size;
7175 const DWORD flags = decode_code_page_flags(code_page);
7176 /* Ideally, we should get reason from FormatMessage. This is the Windows
7177 2000 English version of the message. */
7178 const char *reason = "No mapping for the Unicode character exists "
7179 "in the target code page.";
7180 /* each step cannot decode more than 1 character, but a character can be
7181 represented as a surrogate pair */
7182 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007183 int insize;
7184 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007185 PyObject *errorHandler = NULL;
7186 PyObject *exc = NULL;
7187 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007188 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007189 DWORD err;
7190 int ret = -1;
7191
7192 assert(size > 0);
7193
7194 encoding = code_page_name(code_page, &encoding_obj);
7195 if (encoding == NULL)
7196 return -1;
7197
Victor Stinner7d00cc12014-03-17 23:08:06 +01007198 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007199 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7200 UnicodeDecodeError. */
7201 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7202 if (exc != NULL) {
7203 PyCodec_StrictErrors(exc);
7204 Py_CLEAR(exc);
7205 }
7206 goto error;
7207 }
7208
7209 if (*v == NULL) {
7210 /* Create unicode object */
7211 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7212 PyErr_NoMemory();
7213 goto error;
7214 }
Victor Stinnerab595942011-12-17 04:59:06 +01007215 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007216 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007217 if (*v == NULL)
7218 goto error;
7219 startout = PyUnicode_AS_UNICODE(*v);
7220 }
7221 else {
7222 /* Extend unicode object */
7223 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7224 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7225 PyErr_NoMemory();
7226 goto error;
7227 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007228 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 goto error;
7230 startout = PyUnicode_AS_UNICODE(*v) + n;
7231 }
7232
7233 /* Decode the byte string character per character */
7234 out = startout;
7235 while (in < endin)
7236 {
7237 /* Decode a character */
7238 insize = 1;
7239 do
7240 {
7241 outsize = MultiByteToWideChar(code_page, flags,
7242 in, insize,
7243 buffer, Py_ARRAY_LENGTH(buffer));
7244 if (outsize > 0)
7245 break;
7246 err = GetLastError();
7247 if (err != ERROR_NO_UNICODE_TRANSLATION
7248 && err != ERROR_INSUFFICIENT_BUFFER)
7249 {
7250 PyErr_SetFromWindowsErr(0);
7251 goto error;
7252 }
7253 insize++;
7254 }
7255 /* 4=maximum length of a UTF-8 sequence */
7256 while (insize <= 4 && (in + insize) <= endin);
7257
7258 if (outsize <= 0) {
7259 Py_ssize_t startinpos, endinpos, outpos;
7260
Victor Stinner7d00cc12014-03-17 23:08:06 +01007261 /* last character in partial decode? */
7262 if (in + insize >= endin && !final)
7263 break;
7264
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 startinpos = in - startin;
7266 endinpos = startinpos + 1;
7267 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007268 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007269 errors, &errorHandler,
7270 encoding, reason,
7271 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007272 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007273 {
7274 goto error;
7275 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007276 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007277 }
7278 else {
7279 in += insize;
7280 memcpy(out, buffer, outsize * sizeof(wchar_t));
7281 out += outsize;
7282 }
7283 }
7284
7285 /* write a NUL character at the end */
7286 *out = 0;
7287
7288 /* Extend unicode object */
7289 outsize = out - startout;
7290 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007291 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007292 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007293 /* (in - startin) <= size and size is an int */
7294 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007295
7296error:
7297 Py_XDECREF(encoding_obj);
7298 Py_XDECREF(errorHandler);
7299 Py_XDECREF(exc);
7300 return ret;
7301}
7302
Victor Stinner3a50e702011-10-18 21:21:00 +02007303static PyObject *
7304decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007305 const char *s, Py_ssize_t size,
7306 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007307{
Victor Stinner76a31a62011-11-04 00:05:13 +01007308 PyObject *v = NULL;
7309 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007310
Victor Stinner3a50e702011-10-18 21:21:00 +02007311 if (code_page < 0) {
7312 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7313 return NULL;
7314 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007315 if (size < 0) {
7316 PyErr_BadInternalCall();
7317 return NULL;
7318 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007319
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007321 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007322
Victor Stinner76a31a62011-11-04 00:05:13 +01007323 do
7324 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007325#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007326 if (size > INT_MAX) {
7327 chunk_size = INT_MAX;
7328 final = 0;
7329 done = 0;
7330 }
7331 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007332#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007333 {
7334 chunk_size = (int)size;
7335 final = (consumed == NULL);
7336 done = 1;
7337 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007338
Victor Stinner76a31a62011-11-04 00:05:13 +01007339 if (chunk_size == 0 && done) {
7340 if (v != NULL)
7341 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007342 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007343 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007344
Victor Stinner76a31a62011-11-04 00:05:13 +01007345 converted = decode_code_page_strict(code_page, &v,
7346 s, chunk_size);
7347 if (converted == -2)
7348 converted = decode_code_page_errors(code_page, &v,
7349 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007350 errors, final);
7351 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007352
7353 if (converted < 0) {
7354 Py_XDECREF(v);
7355 return NULL;
7356 }
7357
7358 if (consumed)
7359 *consumed += converted;
7360
7361 s += converted;
7362 size -= converted;
7363 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007364
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007365 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007366}
7367
Alexander Belopolsky40018472011-02-26 01:02:56 +00007368PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007369PyUnicode_DecodeCodePageStateful(int code_page,
7370 const char *s,
7371 Py_ssize_t size,
7372 const char *errors,
7373 Py_ssize_t *consumed)
7374{
7375 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7376}
7377
7378PyObject *
7379PyUnicode_DecodeMBCSStateful(const char *s,
7380 Py_ssize_t size,
7381 const char *errors,
7382 Py_ssize_t *consumed)
7383{
7384 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7385}
7386
7387PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007388PyUnicode_DecodeMBCS(const char *s,
7389 Py_ssize_t size,
7390 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007391{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007392 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7393}
7394
Victor Stinner3a50e702011-10-18 21:21:00 +02007395static DWORD
7396encode_code_page_flags(UINT code_page, const char *errors)
7397{
7398 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007399 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007400 }
7401 else if (code_page == CP_UTF7) {
7402 /* CP_UTF7 only supports flags=0 */
7403 return 0;
7404 }
7405 else {
7406 if (errors != NULL && strcmp(errors, "replace") == 0)
7407 return 0;
7408 else
7409 return WC_NO_BEST_FIT_CHARS;
7410 }
7411}
7412
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007413/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007414 * Encode a Unicode string to a Windows code page into a byte string in strict
7415 * mode.
7416 *
7417 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007418 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007419 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007420static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007421encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007422 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007423 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007424{
Victor Stinner554f3f02010-06-16 23:33:54 +00007425 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 BOOL *pusedDefaultChar = &usedDefaultChar;
7427 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007428 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007429 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007430 const DWORD flags = encode_code_page_flags(code_page, NULL);
7431 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007432 /* Create a substring so that we can get the UTF-16 representation
7433 of just the slice under consideration. */
7434 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007435
Martin v. Löwis3d325192011-11-04 18:23:06 +01007436 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007437
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007439 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007440 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007441 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007442
Victor Stinner2fc507f2011-11-04 20:06:39 +01007443 substring = PyUnicode_Substring(unicode, offset, offset+len);
7444 if (substring == NULL)
7445 return -1;
7446 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7447 if (p == NULL) {
7448 Py_DECREF(substring);
7449 return -1;
7450 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007451 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007452
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007453 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007454 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007455 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007456 NULL, 0,
7457 NULL, pusedDefaultChar);
7458 if (outsize <= 0)
7459 goto error;
7460 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007461 if (pusedDefaultChar && *pusedDefaultChar) {
7462 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007463 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007464 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007465
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007467 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007468 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007469 if (*outbytes == NULL) {
7470 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007472 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007474 }
7475 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007477 const Py_ssize_t n = PyBytes_Size(*outbytes);
7478 if (outsize > PY_SSIZE_T_MAX - n) {
7479 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007480 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007481 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007482 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007483 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7484 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007486 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007487 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007488 }
7489
7490 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007492 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007493 out, outsize,
7494 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007495 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007496 if (outsize <= 0)
7497 goto error;
7498 if (pusedDefaultChar && *pusedDefaultChar)
7499 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007500 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007501
Victor Stinner3a50e702011-10-18 21:21:00 +02007502error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007503 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007504 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7505 return -2;
7506 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007507 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007508}
7509
Victor Stinner3a50e702011-10-18 21:21:00 +02007510/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007511 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007512 * error handler.
7513 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007514 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007515 * -1 on other error.
7516 */
7517static int
7518encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007519 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007520 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007521{
Victor Stinner3a50e702011-10-18 21:21:00 +02007522 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007523 Py_ssize_t pos = unicode_offset;
7524 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007525 /* Ideally, we should get reason from FormatMessage. This is the Windows
7526 2000 English version of the message. */
7527 const char *reason = "invalid character";
7528 /* 4=maximum length of a UTF-8 sequence */
7529 char buffer[4];
7530 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7531 Py_ssize_t outsize;
7532 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007533 PyObject *errorHandler = NULL;
7534 PyObject *exc = NULL;
7535 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007536 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007537 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007538 PyObject *rep;
7539 int ret = -1;
7540
7541 assert(insize > 0);
7542
7543 encoding = code_page_name(code_page, &encoding_obj);
7544 if (encoding == NULL)
7545 return -1;
7546
7547 if (errors == NULL || strcmp(errors, "strict") == 0) {
7548 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7549 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007550 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007551 if (exc != NULL) {
7552 PyCodec_StrictErrors(exc);
7553 Py_DECREF(exc);
7554 }
7555 Py_XDECREF(encoding_obj);
7556 return -1;
7557 }
7558
7559 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7560 pusedDefaultChar = &usedDefaultChar;
7561 else
7562 pusedDefaultChar = NULL;
7563
7564 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7565 PyErr_NoMemory();
7566 goto error;
7567 }
7568 outsize = insize * Py_ARRAY_LENGTH(buffer);
7569
7570 if (*outbytes == NULL) {
7571 /* Create string object */
7572 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7573 if (*outbytes == NULL)
7574 goto error;
7575 out = PyBytes_AS_STRING(*outbytes);
7576 }
7577 else {
7578 /* Extend string object */
7579 Py_ssize_t n = PyBytes_Size(*outbytes);
7580 if (n > PY_SSIZE_T_MAX - outsize) {
7581 PyErr_NoMemory();
7582 goto error;
7583 }
7584 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7585 goto error;
7586 out = PyBytes_AS_STRING(*outbytes) + n;
7587 }
7588
7589 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007590 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007591 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007592 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7593 wchar_t chars[2];
7594 int charsize;
7595 if (ch < 0x10000) {
7596 chars[0] = (wchar_t)ch;
7597 charsize = 1;
7598 }
7599 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007600 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7601 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007602 charsize = 2;
7603 }
7604
Victor Stinner3a50e702011-10-18 21:21:00 +02007605 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007606 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007607 buffer, Py_ARRAY_LENGTH(buffer),
7608 NULL, pusedDefaultChar);
7609 if (outsize > 0) {
7610 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7611 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007612 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007613 memcpy(out, buffer, outsize);
7614 out += outsize;
7615 continue;
7616 }
7617 }
7618 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7619 PyErr_SetFromWindowsErr(0);
7620 goto error;
7621 }
7622
Victor Stinner3a50e702011-10-18 21:21:00 +02007623 rep = unicode_encode_call_errorhandler(
7624 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007625 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007626 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007627 if (rep == NULL)
7628 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007629 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007630
7631 if (PyBytes_Check(rep)) {
7632 outsize = PyBytes_GET_SIZE(rep);
7633 if (outsize != 1) {
7634 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7635 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7636 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7637 Py_DECREF(rep);
7638 goto error;
7639 }
7640 out = PyBytes_AS_STRING(*outbytes) + offset;
7641 }
7642 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7643 out += outsize;
7644 }
7645 else {
7646 Py_ssize_t i;
7647 enum PyUnicode_Kind kind;
7648 void *data;
7649
Benjamin Petersonbac79492012-01-14 13:34:47 -05007650 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007651 Py_DECREF(rep);
7652 goto error;
7653 }
7654
7655 outsize = PyUnicode_GET_LENGTH(rep);
7656 if (outsize != 1) {
7657 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7658 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7659 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7660 Py_DECREF(rep);
7661 goto error;
7662 }
7663 out = PyBytes_AS_STRING(*outbytes) + offset;
7664 }
7665 kind = PyUnicode_KIND(rep);
7666 data = PyUnicode_DATA(rep);
7667 for (i=0; i < outsize; i++) {
7668 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7669 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007670 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007671 encoding, unicode,
7672 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007673 "unable to encode error handler result to ASCII");
7674 Py_DECREF(rep);
7675 goto error;
7676 }
7677 *out = (unsigned char)ch;
7678 out++;
7679 }
7680 }
7681 Py_DECREF(rep);
7682 }
7683 /* write a NUL byte */
7684 *out = 0;
7685 outsize = out - PyBytes_AS_STRING(*outbytes);
7686 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7687 if (_PyBytes_Resize(outbytes, outsize) < 0)
7688 goto error;
7689 ret = 0;
7690
7691error:
7692 Py_XDECREF(encoding_obj);
7693 Py_XDECREF(errorHandler);
7694 Py_XDECREF(exc);
7695 return ret;
7696}
7697
Victor Stinner3a50e702011-10-18 21:21:00 +02007698static PyObject *
7699encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007700 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007701 const char *errors)
7702{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007703 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007704 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007705 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007706 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007707
Victor Stinner29dacf22015-01-26 16:41:32 +01007708 if (!PyUnicode_Check(unicode)) {
7709 PyErr_BadArgument();
7710 return NULL;
7711 }
7712
Benjamin Petersonbac79492012-01-14 13:34:47 -05007713 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007714 return NULL;
7715 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007716
Victor Stinner3a50e702011-10-18 21:21:00 +02007717 if (code_page < 0) {
7718 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7719 return NULL;
7720 }
7721
Martin v. Löwis3d325192011-11-04 18:23:06 +01007722 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007723 return PyBytes_FromStringAndSize(NULL, 0);
7724
Victor Stinner7581cef2011-11-03 22:32:33 +01007725 offset = 0;
7726 do
7727 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007728#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007729 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007730 chunks. */
7731 if (len > INT_MAX/2) {
7732 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007733 done = 0;
7734 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007735 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007736#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007737 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007738 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007739 done = 1;
7740 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007741
Victor Stinner76a31a62011-11-04 00:05:13 +01007742 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007743 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007744 errors);
7745 if (ret == -2)
7746 ret = encode_code_page_errors(code_page, &outbytes,
7747 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007748 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007749 if (ret < 0) {
7750 Py_XDECREF(outbytes);
7751 return NULL;
7752 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007753
Victor Stinner7581cef2011-11-03 22:32:33 +01007754 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007755 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007756 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007757
Victor Stinner3a50e702011-10-18 21:21:00 +02007758 return outbytes;
7759}
7760
7761PyObject *
7762PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7763 Py_ssize_t size,
7764 const char *errors)
7765{
Victor Stinner7581cef2011-11-03 22:32:33 +01007766 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007767 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007768 if (unicode == NULL)
7769 return NULL;
7770 res = encode_code_page(CP_ACP, unicode, errors);
7771 Py_DECREF(unicode);
7772 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007773}
7774
7775PyObject *
7776PyUnicode_EncodeCodePage(int code_page,
7777 PyObject *unicode,
7778 const char *errors)
7779{
Victor Stinner7581cef2011-11-03 22:32:33 +01007780 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007781}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007782
Alexander Belopolsky40018472011-02-26 01:02:56 +00007783PyObject *
7784PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007785{
Victor Stinner7581cef2011-11-03 22:32:33 +01007786 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007787}
7788
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007789#undef NEED_RETRY
7790
Steve Dowercc16be82016-09-08 10:35:16 -07007791#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007792
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793/* --- Character Mapping Codec -------------------------------------------- */
7794
Victor Stinnerfb161b12013-04-18 01:44:27 +02007795static int
7796charmap_decode_string(const char *s,
7797 Py_ssize_t size,
7798 PyObject *mapping,
7799 const char *errors,
7800 _PyUnicodeWriter *writer)
7801{
7802 const char *starts = s;
7803 const char *e;
7804 Py_ssize_t startinpos, endinpos;
7805 PyObject *errorHandler = NULL, *exc = NULL;
7806 Py_ssize_t maplen;
7807 enum PyUnicode_Kind mapkind;
7808 void *mapdata;
7809 Py_UCS4 x;
7810 unsigned char ch;
7811
7812 if (PyUnicode_READY(mapping) == -1)
7813 return -1;
7814
7815 maplen = PyUnicode_GET_LENGTH(mapping);
7816 mapdata = PyUnicode_DATA(mapping);
7817 mapkind = PyUnicode_KIND(mapping);
7818
7819 e = s + size;
7820
7821 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7822 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7823 * is disabled in encoding aliases, latin1 is preferred because
7824 * its implementation is faster. */
7825 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7826 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7827 Py_UCS4 maxchar = writer->maxchar;
7828
7829 assert (writer->kind == PyUnicode_1BYTE_KIND);
7830 while (s < e) {
7831 ch = *s;
7832 x = mapdata_ucs1[ch];
7833 if (x > maxchar) {
7834 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7835 goto onError;
7836 maxchar = writer->maxchar;
7837 outdata = (Py_UCS1 *)writer->data;
7838 }
7839 outdata[writer->pos] = x;
7840 writer->pos++;
7841 ++s;
7842 }
7843 return 0;
7844 }
7845
7846 while (s < e) {
7847 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7848 enum PyUnicode_Kind outkind = writer->kind;
7849 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7850 if (outkind == PyUnicode_1BYTE_KIND) {
7851 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7852 Py_UCS4 maxchar = writer->maxchar;
7853 while (s < e) {
7854 ch = *s;
7855 x = mapdata_ucs2[ch];
7856 if (x > maxchar)
7857 goto Error;
7858 outdata[writer->pos] = x;
7859 writer->pos++;
7860 ++s;
7861 }
7862 break;
7863 }
7864 else if (outkind == PyUnicode_2BYTE_KIND) {
7865 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7866 while (s < e) {
7867 ch = *s;
7868 x = mapdata_ucs2[ch];
7869 if (x == 0xFFFE)
7870 goto Error;
7871 outdata[writer->pos] = x;
7872 writer->pos++;
7873 ++s;
7874 }
7875 break;
7876 }
7877 }
7878 ch = *s;
7879
7880 if (ch < maplen)
7881 x = PyUnicode_READ(mapkind, mapdata, ch);
7882 else
7883 x = 0xfffe; /* invalid value */
7884Error:
7885 if (x == 0xfffe)
7886 {
7887 /* undefined mapping */
7888 startinpos = s-starts;
7889 endinpos = startinpos+1;
7890 if (unicode_decode_call_errorhandler_writer(
7891 errors, &errorHandler,
7892 "charmap", "character maps to <undefined>",
7893 &starts, &e, &startinpos, &endinpos, &exc, &s,
7894 writer)) {
7895 goto onError;
7896 }
7897 continue;
7898 }
7899
7900 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7901 goto onError;
7902 ++s;
7903 }
7904 Py_XDECREF(errorHandler);
7905 Py_XDECREF(exc);
7906 return 0;
7907
7908onError:
7909 Py_XDECREF(errorHandler);
7910 Py_XDECREF(exc);
7911 return -1;
7912}
7913
7914static int
7915charmap_decode_mapping(const char *s,
7916 Py_ssize_t size,
7917 PyObject *mapping,
7918 const char *errors,
7919 _PyUnicodeWriter *writer)
7920{
7921 const char *starts = s;
7922 const char *e;
7923 Py_ssize_t startinpos, endinpos;
7924 PyObject *errorHandler = NULL, *exc = NULL;
7925 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007926 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007927
7928 e = s + size;
7929
7930 while (s < e) {
7931 ch = *s;
7932
7933 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7934 key = PyLong_FromLong((long)ch);
7935 if (key == NULL)
7936 goto onError;
7937
7938 item = PyObject_GetItem(mapping, key);
7939 Py_DECREF(key);
7940 if (item == NULL) {
7941 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7942 /* No mapping found means: mapping is undefined. */
7943 PyErr_Clear();
7944 goto Undefined;
7945 } else
7946 goto onError;
7947 }
7948
7949 /* Apply mapping */
7950 if (item == Py_None)
7951 goto Undefined;
7952 if (PyLong_Check(item)) {
7953 long value = PyLong_AS_LONG(item);
7954 if (value == 0xFFFE)
7955 goto Undefined;
7956 if (value < 0 || value > MAX_UNICODE) {
7957 PyErr_Format(PyExc_TypeError,
7958 "character mapping must be in range(0x%lx)",
7959 (unsigned long)MAX_UNICODE + 1);
7960 goto onError;
7961 }
7962
7963 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7964 goto onError;
7965 }
7966 else if (PyUnicode_Check(item)) {
7967 if (PyUnicode_READY(item) == -1)
7968 goto onError;
7969 if (PyUnicode_GET_LENGTH(item) == 1) {
7970 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7971 if (value == 0xFFFE)
7972 goto Undefined;
7973 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7974 goto onError;
7975 }
7976 else {
7977 writer->overallocate = 1;
7978 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7979 goto onError;
7980 }
7981 }
7982 else {
7983 /* wrong return value */
7984 PyErr_SetString(PyExc_TypeError,
7985 "character mapping must return integer, None or str");
7986 goto onError;
7987 }
7988 Py_CLEAR(item);
7989 ++s;
7990 continue;
7991
7992Undefined:
7993 /* undefined mapping */
7994 Py_CLEAR(item);
7995 startinpos = s-starts;
7996 endinpos = startinpos+1;
7997 if (unicode_decode_call_errorhandler_writer(
7998 errors, &errorHandler,
7999 "charmap", "character maps to <undefined>",
8000 &starts, &e, &startinpos, &endinpos, &exc, &s,
8001 writer)) {
8002 goto onError;
8003 }
8004 }
8005 Py_XDECREF(errorHandler);
8006 Py_XDECREF(exc);
8007 return 0;
8008
8009onError:
8010 Py_XDECREF(item);
8011 Py_XDECREF(errorHandler);
8012 Py_XDECREF(exc);
8013 return -1;
8014}
8015
Alexander Belopolsky40018472011-02-26 01:02:56 +00008016PyObject *
8017PyUnicode_DecodeCharmap(const char *s,
8018 Py_ssize_t size,
8019 PyObject *mapping,
8020 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008022 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008023
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 /* Default to Latin-1 */
8025 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008029 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008030 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008031 writer.min_length = size;
8032 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008034
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008035 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008036 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8037 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008038 }
8039 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008040 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8041 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008043 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008044
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008046 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047 return NULL;
8048}
8049
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008050/* Charmap encoding: the lookup table */
8051
Alexander Belopolsky40018472011-02-26 01:02:56 +00008052struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 PyObject_HEAD
8054 unsigned char level1[32];
8055 int count2, count3;
8056 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008057};
8058
8059static PyObject*
8060encoding_map_size(PyObject *obj, PyObject* args)
8061{
8062 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008063 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008064 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065}
8066
8067static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008068 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 PyDoc_STR("Return the size (in bytes) of this object") },
8070 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008071};
8072
8073static void
8074encoding_map_dealloc(PyObject* o)
8075{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008076 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008077}
8078
8079static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008080 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 "EncodingMap", /*tp_name*/
8082 sizeof(struct encoding_map), /*tp_basicsize*/
8083 0, /*tp_itemsize*/
8084 /* methods */
8085 encoding_map_dealloc, /*tp_dealloc*/
8086 0, /*tp_print*/
8087 0, /*tp_getattr*/
8088 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008089 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008090 0, /*tp_repr*/
8091 0, /*tp_as_number*/
8092 0, /*tp_as_sequence*/
8093 0, /*tp_as_mapping*/
8094 0, /*tp_hash*/
8095 0, /*tp_call*/
8096 0, /*tp_str*/
8097 0, /*tp_getattro*/
8098 0, /*tp_setattro*/
8099 0, /*tp_as_buffer*/
8100 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8101 0, /*tp_doc*/
8102 0, /*tp_traverse*/
8103 0, /*tp_clear*/
8104 0, /*tp_richcompare*/
8105 0, /*tp_weaklistoffset*/
8106 0, /*tp_iter*/
8107 0, /*tp_iternext*/
8108 encoding_map_methods, /*tp_methods*/
8109 0, /*tp_members*/
8110 0, /*tp_getset*/
8111 0, /*tp_base*/
8112 0, /*tp_dict*/
8113 0, /*tp_descr_get*/
8114 0, /*tp_descr_set*/
8115 0, /*tp_dictoffset*/
8116 0, /*tp_init*/
8117 0, /*tp_alloc*/
8118 0, /*tp_new*/
8119 0, /*tp_free*/
8120 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008121};
8122
8123PyObject*
8124PyUnicode_BuildEncodingMap(PyObject* string)
8125{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008126 PyObject *result;
8127 struct encoding_map *mresult;
8128 int i;
8129 int need_dict = 0;
8130 unsigned char level1[32];
8131 unsigned char level2[512];
8132 unsigned char *mlevel1, *mlevel2, *mlevel3;
8133 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008134 int kind;
8135 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008136 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008137 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008138
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008139 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008140 PyErr_BadArgument();
8141 return NULL;
8142 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008143 kind = PyUnicode_KIND(string);
8144 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008145 length = PyUnicode_GET_LENGTH(string);
8146 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008147 memset(level1, 0xFF, sizeof level1);
8148 memset(level2, 0xFF, sizeof level2);
8149
8150 /* If there isn't a one-to-one mapping of NULL to \0,
8151 or if there are non-BMP characters, we need to use
8152 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008153 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008154 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008155 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008156 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008157 ch = PyUnicode_READ(kind, data, i);
8158 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008159 need_dict = 1;
8160 break;
8161 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008162 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008163 /* unmapped character */
8164 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008165 l1 = ch >> 11;
8166 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008167 if (level1[l1] == 0xFF)
8168 level1[l1] = count2++;
8169 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008170 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008171 }
8172
8173 if (count2 >= 0xFF || count3 >= 0xFF)
8174 need_dict = 1;
8175
8176 if (need_dict) {
8177 PyObject *result = PyDict_New();
8178 PyObject *key, *value;
8179 if (!result)
8180 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008181 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008182 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008183 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008184 if (!key || !value)
8185 goto failed1;
8186 if (PyDict_SetItem(result, key, value) == -1)
8187 goto failed1;
8188 Py_DECREF(key);
8189 Py_DECREF(value);
8190 }
8191 return result;
8192 failed1:
8193 Py_XDECREF(key);
8194 Py_XDECREF(value);
8195 Py_DECREF(result);
8196 return NULL;
8197 }
8198
8199 /* Create a three-level trie */
8200 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8201 16*count2 + 128*count3 - 1);
8202 if (!result)
8203 return PyErr_NoMemory();
8204 PyObject_Init(result, &EncodingMapType);
8205 mresult = (struct encoding_map*)result;
8206 mresult->count2 = count2;
8207 mresult->count3 = count3;
8208 mlevel1 = mresult->level1;
8209 mlevel2 = mresult->level23;
8210 mlevel3 = mresult->level23 + 16*count2;
8211 memcpy(mlevel1, level1, 32);
8212 memset(mlevel2, 0xFF, 16*count2);
8213 memset(mlevel3, 0, 128*count3);
8214 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008215 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008216 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008217 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8218 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008219 /* unmapped character */
8220 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008221 o1 = ch>>11;
8222 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008223 i2 = 16*mlevel1[o1] + o2;
8224 if (mlevel2[i2] == 0xFF)
8225 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008226 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008227 i3 = 128*mlevel2[i2] + o3;
8228 mlevel3[i3] = i;
8229 }
8230 return result;
8231}
8232
8233static int
Victor Stinner22168992011-11-20 17:09:18 +01008234encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008235{
8236 struct encoding_map *map = (struct encoding_map*)mapping;
8237 int l1 = c>>11;
8238 int l2 = (c>>7) & 0xF;
8239 int l3 = c & 0x7F;
8240 int i;
8241
Victor Stinner22168992011-11-20 17:09:18 +01008242 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008244 if (c == 0)
8245 return 0;
8246 /* level 1*/
8247 i = map->level1[l1];
8248 if (i == 0xFF) {
8249 return -1;
8250 }
8251 /* level 2*/
8252 i = map->level23[16*i+l2];
8253 if (i == 0xFF) {
8254 return -1;
8255 }
8256 /* level 3 */
8257 i = map->level23[16*map->count2 + 128*i + l3];
8258 if (i == 0) {
8259 return -1;
8260 }
8261 return i;
8262}
8263
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264/* Lookup the character ch in the mapping. If the character
8265 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008266 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008267static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008268charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269{
Christian Heimes217cfd12007-12-02 14:31:20 +00008270 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008271 PyObject *x;
8272
8273 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008275 x = PyObject_GetItem(mapping, w);
8276 Py_DECREF(w);
8277 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8279 /* No mapping found means: mapping is undefined. */
8280 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008281 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 } else
8283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008285 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008286 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008287 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008288 long value = PyLong_AS_LONG(x);
8289 if (value < 0 || value > 255) {
8290 PyErr_SetString(PyExc_TypeError,
8291 "character mapping must be in range(256)");
8292 Py_DECREF(x);
8293 return NULL;
8294 }
8295 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008297 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 /* wrong return value */
8301 PyErr_Format(PyExc_TypeError,
8302 "character mapping must return integer, bytes or None, not %.400s",
8303 x->ob_type->tp_name);
8304 Py_DECREF(x);
8305 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 }
8307}
8308
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008309static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008310charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008311{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008312 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8313 /* exponentially overallocate to minimize reallocations */
8314 if (requiredsize < 2*outsize)
8315 requiredsize = 2*outsize;
8316 if (_PyBytes_Resize(outobj, requiredsize))
8317 return -1;
8318 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008319}
8320
/* Outcome of trying to encode one character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred, e.g. the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008325 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008326 space is available. Return a new reference to the object that
8327 was put in the output buffer, or Py_None, if the mapping was undefined
8328 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008329 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008330static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008331charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008332 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008334 PyObject *rep;
8335 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008336 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008337
Christian Heimes90aa7642007-12-19 02:45:37 +00008338 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008339 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008341 if (res == -1)
8342 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 if (outsize<requiredsize)
8344 if (charmapencode_resize(outobj, outpos, requiredsize))
8345 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008346 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 outstart[(*outpos)++] = (char)res;
8348 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008349 }
8350
8351 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008354 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 Py_DECREF(rep);
8356 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008357 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 if (PyLong_Check(rep)) {
8359 Py_ssize_t requiredsize = *outpos+1;
8360 if (outsize<requiredsize)
8361 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8362 Py_DECREF(rep);
8363 return enc_EXCEPTION;
8364 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008365 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008367 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 else {
8369 const char *repchars = PyBytes_AS_STRING(rep);
8370 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8371 Py_ssize_t requiredsize = *outpos+repsize;
8372 if (outsize<requiredsize)
8373 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8374 Py_DECREF(rep);
8375 return enc_EXCEPTION;
8376 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008377 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 memcpy(outstart + *outpos, repchars, repsize);
8379 *outpos += repsize;
8380 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008382 Py_DECREF(rep);
8383 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008384}
8385
8386/* handle an error in PyUnicode_EncodeCharmap
8387 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008388static int
8389charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008390 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008392 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008393 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394{
8395 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008396 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008397 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008398 enum PyUnicode_Kind kind;
8399 void *data;
8400 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008402 Py_ssize_t collstartpos = *inpos;
8403 Py_ssize_t collendpos = *inpos+1;
8404 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008405 const char *encoding = "charmap";
8406 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008407 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008408 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008409 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410
Benjamin Petersonbac79492012-01-14 13:34:47 -05008411 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008412 return -1;
8413 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008414 /* find all unencodable characters */
8415 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008416 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008417 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008418 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008419 val = encoding_map_lookup(ch, mapping);
8420 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 break;
8422 ++collendpos;
8423 continue;
8424 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008425
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008426 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8427 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 if (rep==NULL)
8429 return -1;
8430 else if (rep!=Py_None) {
8431 Py_DECREF(rep);
8432 break;
8433 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008434 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008435 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436 }
8437 /* cache callback name lookup
8438 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008439 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008440 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008441
8442 switch (*error_handler) {
8443 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008444 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008445 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008446
8447 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008448 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 x = charmapencode_output('?', mapping, res, respos);
8450 if (x==enc_EXCEPTION) {
8451 return -1;
8452 }
8453 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008454 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 return -1;
8456 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008457 }
8458 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008459 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008460 *inpos = collendpos;
8461 break;
Victor Stinner50149202015-09-22 00:26:54 +02008462
8463 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008464 /* generate replacement (temporarily (mis)uses p) */
8465 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 char buffer[2+29+1+1];
8467 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008468 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 for (cp = buffer; *cp; ++cp) {
8470 x = charmapencode_output(*cp, mapping, res, respos);
8471 if (x==enc_EXCEPTION)
8472 return -1;
8473 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008474 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 return -1;
8476 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008477 }
8478 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008479 *inpos = collendpos;
8480 break;
Victor Stinner50149202015-09-22 00:26:54 +02008481
Benjamin Peterson14339b62009-01-31 16:36:08 +00008482 default:
Victor Stinner50149202015-09-22 00:26:54 +02008483 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008484 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008486 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008488 if (PyBytes_Check(repunicode)) {
8489 /* Directly copy bytes result to output. */
8490 Py_ssize_t outsize = PyBytes_Size(*res);
8491 Py_ssize_t requiredsize;
8492 repsize = PyBytes_Size(repunicode);
8493 requiredsize = *respos + repsize;
8494 if (requiredsize > outsize)
8495 /* Make room for all additional bytes. */
8496 if (charmapencode_resize(res, respos, requiredsize)) {
8497 Py_DECREF(repunicode);
8498 return -1;
8499 }
8500 memcpy(PyBytes_AsString(*res) + *respos,
8501 PyBytes_AsString(repunicode), repsize);
8502 *respos += repsize;
8503 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008504 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008505 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008506 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008507 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008508 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008509 Py_DECREF(repunicode);
8510 return -1;
8511 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008512 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008513 data = PyUnicode_DATA(repunicode);
8514 kind = PyUnicode_KIND(repunicode);
8515 for (index = 0; index < repsize; index++) {
8516 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8517 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008519 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 return -1;
8521 }
8522 else if (x==enc_FAILED) {
8523 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008524 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 return -1;
8526 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008527 }
8528 *inpos = newpos;
8529 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 }
8531 return 0;
8532}
8533
Alexander Belopolsky40018472011-02-26 01:02:56 +00008534PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008535_PyUnicode_EncodeCharmap(PyObject *unicode,
8536 PyObject *mapping,
8537 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008539 /* output object */
8540 PyObject *res = NULL;
8541 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008542 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008543 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008545 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008546 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008547 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008548 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008549 void *data;
8550 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551
Benjamin Petersonbac79492012-01-14 13:34:47 -05008552 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008553 return NULL;
8554 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008555 data = PyUnicode_DATA(unicode);
8556 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008557
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558 /* Default to Latin-1 */
8559 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008560 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008562 /* allocate enough for a simple encoding without
8563 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008564 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008565 if (res == NULL)
8566 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008567 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008570 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008571 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008573 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 if (x==enc_EXCEPTION) /* error */
8575 goto onError;
8576 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008577 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008579 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 &res, &respos)) {
8581 goto onError;
8582 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008583 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 else
8585 /* done with this character => adjust input position */
8586 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008589 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008590 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008591 if (_PyBytes_Resize(&res, respos) < 0)
8592 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008593
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008594 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008595 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008596 return res;
8597
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599 Py_XDECREF(res);
8600 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008601 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602 return NULL;
8603}
8604
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008605/* Deprecated */
8606PyObject *
8607PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8608 Py_ssize_t size,
8609 PyObject *mapping,
8610 const char *errors)
8611{
8612 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008613 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008614 if (unicode == NULL)
8615 return NULL;
8616 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8617 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008618 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008619}
8620
Alexander Belopolsky40018472011-02-26 01:02:56 +00008621PyObject *
8622PyUnicode_AsCharmapString(PyObject *unicode,
8623 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624{
8625 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 PyErr_BadArgument();
8627 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008629 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630}
8631
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008632/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008633static void
8634make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008636 Py_ssize_t startpos, Py_ssize_t endpos,
8637 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008639 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640 *exceptionObject = _PyUnicodeTranslateError_Create(
8641 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642 }
8643 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8645 goto onError;
8646 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8647 goto onError;
8648 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8649 goto onError;
8650 return;
8651 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008652 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653 }
8654}
8655
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008656/* error handling callback helper:
8657 build arguments, call the callback and check the arguments,
8658 put the result into newpos and return the replacement string, which
8659 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008660static PyObject *
8661unicode_translate_call_errorhandler(const char *errors,
8662 PyObject **errorHandler,
8663 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008665 Py_ssize_t startpos, Py_ssize_t endpos,
8666 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008667{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008668 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008670 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 PyObject *restuple;
8672 PyObject *resunicode;
8673
8674 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008678 }
8679
8680 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008684
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008685 restuple = PyObject_CallFunctionObjArgs(
8686 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008689 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008690 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 Py_DECREF(restuple);
8692 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008693 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008694 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 &resunicode, &i_newpos)) {
8696 Py_DECREF(restuple);
8697 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008698 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008699 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008700 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008701 else
8702 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008704 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 Py_DECREF(restuple);
8706 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008707 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008708 Py_INCREF(resunicode);
8709 Py_DECREF(restuple);
8710 return resunicode;
8711}
8712
8713/* Lookup the character ch in the mapping and put the result in result,
8714 which must be decrefed by the caller.
8715 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008716static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008717charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718{
Christian Heimes217cfd12007-12-02 14:31:20 +00008719 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008720 PyObject *x;
8721
8722 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008724 x = PyObject_GetItem(mapping, w);
8725 Py_DECREF(w);
8726 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8728 /* No mapping found means: use 1:1 mapping. */
8729 PyErr_Clear();
8730 *result = NULL;
8731 return 0;
8732 } else
8733 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008734 }
8735 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 *result = x;
8737 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008738 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008739 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008741 if (value < 0 || value > MAX_UNICODE) {
8742 PyErr_Format(PyExc_ValueError,
8743 "character mapping must be in range(0x%x)",
8744 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 Py_DECREF(x);
8746 return -1;
8747 }
8748 *result = x;
8749 return 0;
8750 }
8751 else if (PyUnicode_Check(x)) {
8752 *result = x;
8753 return 0;
8754 }
8755 else {
8756 /* wrong return value */
8757 PyErr_SetString(PyExc_TypeError,
8758 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008759 Py_DECREF(x);
8760 return -1;
8761 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008762}
Victor Stinner1194ea02014-04-04 19:37:40 +02008763
8764/* lookup the character, write the result into the writer.
8765 Return 1 if the result was written into the writer, return 0 if the mapping
8766 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008767static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008768charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8769 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008770{
Victor Stinner1194ea02014-04-04 19:37:40 +02008771 PyObject *item;
8772
8773 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008775
8776 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008778 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008779 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008780 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008781 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008782 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008783
8784 if (item == Py_None) {
8785 Py_DECREF(item);
8786 return 0;
8787 }
8788
8789 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008790 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8791 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8792 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008793 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8794 Py_DECREF(item);
8795 return -1;
8796 }
8797 Py_DECREF(item);
8798 return 1;
8799 }
8800
8801 if (!PyUnicode_Check(item)) {
8802 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008804 }
8805
8806 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8807 Py_DECREF(item);
8808 return -1;
8809 }
8810
8811 Py_DECREF(item);
8812 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008813}
8814
Victor Stinner89a76ab2014-04-05 11:44:04 +02008815static int
8816unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8817 Py_UCS1 *translate)
8818{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008819 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008820 int ret = 0;
8821
Victor Stinner89a76ab2014-04-05 11:44:04 +02008822 if (charmaptranslate_lookup(ch, mapping, &item)) {
8823 return -1;
8824 }
8825
8826 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008827 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008828 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008829 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008830 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008831 /* not found => default to 1:1 mapping */
8832 translate[ch] = ch;
8833 return 1;
8834 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008835 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008836 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008837 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8838 used it */
8839 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008840 /* invalid character or character outside ASCII:
8841 skip the fast translate */
8842 goto exit;
8843 }
8844 translate[ch] = (Py_UCS1)replace;
8845 }
8846 else if (PyUnicode_Check(item)) {
8847 Py_UCS4 replace;
8848
8849 if (PyUnicode_READY(item) == -1) {
8850 Py_DECREF(item);
8851 return -1;
8852 }
8853 if (PyUnicode_GET_LENGTH(item) != 1)
8854 goto exit;
8855
8856 replace = PyUnicode_READ_CHAR(item, 0);
8857 if (replace > 127)
8858 goto exit;
8859 translate[ch] = (Py_UCS1)replace;
8860 }
8861 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008862 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008863 goto exit;
8864 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008865 ret = 1;
8866
Benjamin Peterson1365de72014-04-07 20:15:41 -04008867 exit:
8868 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008869 return ret;
8870}
8871
8872/* Fast path for ascii => ascii translation. Return 1 if the whole string
8873 was translated into writer, return 0 if the input string was partially
8874 translated into writer, raise an exception and return -1 on error. */
8875static int
8876unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008877 _PyUnicodeWriter *writer, int ignore,
8878 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008879{
Victor Stinner872b2912014-04-05 14:27:07 +02008880 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008881 Py_ssize_t len;
8882 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008883 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008884
Victor Stinner89a76ab2014-04-05 11:44:04 +02008885 len = PyUnicode_GET_LENGTH(input);
8886
Victor Stinner872b2912014-04-05 14:27:07 +02008887 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008888
8889 in = PyUnicode_1BYTE_DATA(input);
8890 end = in + len;
8891
8892 assert(PyUnicode_IS_ASCII(writer->buffer));
8893 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8894 out = PyUnicode_1BYTE_DATA(writer->buffer);
8895
Victor Stinner872b2912014-04-05 14:27:07 +02008896 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008897 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008898 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008899 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008900 int translate = unicode_fast_translate_lookup(mapping, ch,
8901 ascii_table);
8902 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008903 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008904 if (translate == 0)
8905 goto exit;
8906 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008907 }
Victor Stinner872b2912014-04-05 14:27:07 +02008908 if (ch2 == 0xfe) {
8909 if (ignore)
8910 continue;
8911 goto exit;
8912 }
8913 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008914 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008915 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008916 }
Victor Stinner872b2912014-04-05 14:27:07 +02008917 res = 1;
8918
8919exit:
8920 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008921 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008922 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008923}
8924
Victor Stinner3222da22015-10-01 22:07:32 +02008925static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926_PyUnicode_TranslateCharmap(PyObject *input,
8927 PyObject *mapping,
8928 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008929{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008931 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932 Py_ssize_t size, i;
8933 int kind;
8934 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008935 _PyUnicodeWriter writer;
8936 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008937 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008938 PyObject *errorHandler = NULL;
8939 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008940 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008941 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008942
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008944 PyErr_BadArgument();
8945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948 if (PyUnicode_READY(input) == -1)
8949 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008950 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 kind = PyUnicode_KIND(input);
8952 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008954 if (size == 0)
8955 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008957 /* allocate enough for a simple 1:1 translation without
8958 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008959 _PyUnicodeWriter_Init(&writer);
8960 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008961 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962
Victor Stinner872b2912014-04-05 14:27:07 +02008963 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8964
Victor Stinner33798672016-03-01 21:59:58 +01008965 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008966 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008967 if (PyUnicode_IS_ASCII(input)) {
8968 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8969 if (res < 0) {
8970 _PyUnicodeWriter_Dealloc(&writer);
8971 return NULL;
8972 }
8973 if (res == 1)
8974 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008975 }
Victor Stinner33798672016-03-01 21:59:58 +01008976 else {
8977 i = 0;
8978 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008981 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008982 int translate;
8983 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8984 Py_ssize_t newpos;
8985 /* startpos for collecting untranslatable chars */
8986 Py_ssize_t collstart;
8987 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008988 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989
Victor Stinner1194ea02014-04-04 19:37:40 +02008990 ch = PyUnicode_READ(kind, data, i);
8991 translate = charmaptranslate_output(ch, mapping, &writer);
8992 if (translate < 0)
8993 goto onError;
8994
8995 if (translate != 0) {
8996 /* it worked => adjust input pointer */
8997 ++i;
8998 continue;
8999 }
9000
9001 /* untranslatable character */
9002 collstart = i;
9003 collend = i+1;
9004
9005 /* find all untranslatable characters */
9006 while (collend < size) {
9007 PyObject *x;
9008 ch = PyUnicode_READ(kind, data, collend);
9009 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009010 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009011 Py_XDECREF(x);
9012 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009013 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009014 ++collend;
9015 }
9016
9017 if (ignore) {
9018 i = collend;
9019 }
9020 else {
9021 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9022 reason, input, &exc,
9023 collstart, collend, &newpos);
9024 if (repunicode == NULL)
9025 goto onError;
9026 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009027 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009028 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009029 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009030 Py_DECREF(repunicode);
9031 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009032 }
9033 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009034 Py_XDECREF(exc);
9035 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009036 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037
Benjamin Peterson29060642009-01-31 22:14:21 +00009038 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009039 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009040 Py_XDECREF(exc);
9041 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042 return NULL;
9043}
9044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045/* Deprecated. Use PyUnicode_Translate instead. */
9046PyObject *
9047PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9048 Py_ssize_t size,
9049 PyObject *mapping,
9050 const char *errors)
9051{
Christian Heimes5f520f42012-09-11 14:03:25 +02009052 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009053 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 if (!unicode)
9055 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009056 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9057 Py_DECREF(unicode);
9058 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059}
9060
Alexander Belopolsky40018472011-02-26 01:02:56 +00009061PyObject *
9062PyUnicode_Translate(PyObject *str,
9063 PyObject *mapping,
9064 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009066 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009067 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009068 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069}
Tim Petersced69f82003-09-16 20:30:58 +00009070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071PyObject *
9072_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9073{
9074 if (!PyUnicode_Check(unicode)) {
9075 PyErr_BadInternalCall();
9076 return NULL;
9077 }
9078 if (PyUnicode_READY(unicode) == -1)
9079 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009080 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009081 /* If the string is already ASCII, just return the same string */
9082 Py_INCREF(unicode);
9083 return unicode;
9084 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009085
9086 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9087 PyObject *result = PyUnicode_New(len, 127);
9088 if (result == NULL) {
9089 return NULL;
9090 }
9091
9092 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9093 int kind = PyUnicode_KIND(unicode);
9094 const void *data = PyUnicode_DATA(unicode);
9095 Py_ssize_t i;
9096 for (i = 0; i < len; ++i) {
9097 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9098 if (ch < 127) {
9099 out[i] = ch;
9100 }
9101 else if (Py_UNICODE_ISSPACE(ch)) {
9102 out[i] = ' ';
9103 }
9104 else {
9105 int decimal = Py_UNICODE_TODECIMAL(ch);
9106 if (decimal < 0) {
9107 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009108 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009109 _PyUnicode_LENGTH(result) = i + 1;
9110 break;
9111 }
9112 out[i] = '0' + decimal;
9113 }
9114 }
9115
INADA Naoki16dfca42018-07-14 12:06:43 +09009116 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009117 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009118}
9119
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009120PyObject *
9121PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9122 Py_ssize_t length)
9123{
Victor Stinnerf0124502011-11-21 23:12:56 +01009124 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009125 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009126 Py_UCS4 maxchar;
9127 enum PyUnicode_Kind kind;
9128 void *data;
9129
Victor Stinner99d7ad02012-02-22 13:37:39 +01009130 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009131 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009132 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009133 if (ch > 127) {
9134 int decimal = Py_UNICODE_TODECIMAL(ch);
9135 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009136 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009137 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009138 }
9139 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009140
9141 /* Copy to a new string */
9142 decimal = PyUnicode_New(length, maxchar);
9143 if (decimal == NULL)
9144 return decimal;
9145 kind = PyUnicode_KIND(decimal);
9146 data = PyUnicode_DATA(decimal);
9147 /* Iterate over code points */
9148 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009149 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009150 if (ch > 127) {
9151 int decimal = Py_UNICODE_TODECIMAL(ch);
9152 if (decimal >= 0)
9153 ch = '0' + decimal;
9154 }
9155 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009156 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009157 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009158}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009159/* --- Decimal Encoder ---------------------------------------------------- */
9160
Alexander Belopolsky40018472011-02-26 01:02:56 +00009161int
9162PyUnicode_EncodeDecimal(Py_UNICODE *s,
9163 Py_ssize_t length,
9164 char *output,
9165 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009166{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009167 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009168 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009169 enum PyUnicode_Kind kind;
9170 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009171
9172 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009173 PyErr_BadArgument();
9174 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009175 }
9176
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009177 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009178 if (unicode == NULL)
9179 return -1;
9180
Victor Stinner42bf7752011-11-21 22:52:58 +01009181 kind = PyUnicode_KIND(unicode);
9182 data = PyUnicode_DATA(unicode);
9183
Victor Stinnerb84d7232011-11-22 01:50:07 +01009184 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009185 PyObject *exc;
9186 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009187 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009188 Py_ssize_t startpos;
9189
9190 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009191
Benjamin Peterson29060642009-01-31 22:14:21 +00009192 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009193 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009194 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009195 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009196 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009197 decimal = Py_UNICODE_TODECIMAL(ch);
9198 if (decimal >= 0) {
9199 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009200 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009201 continue;
9202 }
9203 if (0 < ch && ch < 256) {
9204 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009205 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009206 continue;
9207 }
Victor Stinner6345be92011-11-25 20:09:01 +01009208
Victor Stinner42bf7752011-11-21 22:52:58 +01009209 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009210 exc = NULL;
9211 raise_encode_exception(&exc, "decimal", unicode,
9212 startpos, startpos+1,
9213 "invalid decimal Unicode string");
9214 Py_XDECREF(exc);
9215 Py_DECREF(unicode);
9216 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009217 }
9218 /* 0-terminate the output string */
9219 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009220 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009221 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009222}
9223
Guido van Rossumd57fd912000-03-10 22:53:23 +00009224/* --- Helpers ------------------------------------------------------------ */
9225
/* Helper macro to fix up start/end slice values against a length `len`:
     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` has `len` added and, if still
       negative, is clamped to 0.
   `start` and `end` must be modifiable lvalues; every argument is
   evaluated more than once, so pass expressions without side effects.
   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement, including inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009242any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009244 Py_ssize_t end,
9245 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009247 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248 void *buf1, *buf2;
9249 Py_ssize_t len1, len2, result;
9250
9251 kind1 = PyUnicode_KIND(s1);
9252 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009253 if (kind1 < kind2)
9254 return -1;
9255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256 len1 = PyUnicode_GET_LENGTH(s1);
9257 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009258 ADJUST_INDICES(start, end, len1);
9259 if (end - start < len2)
9260 return -1;
9261
9262 buf1 = PyUnicode_DATA(s1);
9263 buf2 = PyUnicode_DATA(s2);
9264 if (len2 == 1) {
9265 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9266 result = findchar((const char *)buf1 + kind1*start,
9267 kind1, end - start, ch, direction);
9268 if (result == -1)
9269 return -1;
9270 else
9271 return start + result;
9272 }
9273
9274 if (kind2 != kind1) {
9275 buf2 = _PyUnicode_AsKind(s2, kind1);
9276 if (!buf2)
9277 return -2;
9278 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279
Victor Stinner794d5672011-10-10 03:21:36 +02009280 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009281 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009282 case PyUnicode_1BYTE_KIND:
9283 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9284 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9285 else
9286 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9287 break;
9288 case PyUnicode_2BYTE_KIND:
9289 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9290 break;
9291 case PyUnicode_4BYTE_KIND:
9292 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9293 break;
9294 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009295 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009296 }
9297 }
9298 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009299 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009300 case PyUnicode_1BYTE_KIND:
9301 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9302 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9303 else
9304 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9305 break;
9306 case PyUnicode_2BYTE_KIND:
9307 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9308 break;
9309 case PyUnicode_4BYTE_KIND:
9310 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9311 break;
9312 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009313 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 }
9316
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009317 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 PyMem_Free(buf2);
9319
9320 return result;
9321}
9322
9323Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009324_PyUnicode_InsertThousandsGrouping(
9325 PyObject *unicode, Py_ssize_t index,
9326 Py_ssize_t n_buffer,
9327 void *digits, Py_ssize_t n_digits,
9328 Py_ssize_t min_width,
9329 const char *grouping, PyObject *thousands_sep,
9330 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331{
Victor Stinner41a863c2012-02-24 00:37:51 +01009332 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009333 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009334 Py_ssize_t thousands_sep_len;
9335 Py_ssize_t len;
9336
9337 if (unicode != NULL) {
9338 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009339 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009340 }
9341 else {
9342 kind = PyUnicode_1BYTE_KIND;
9343 data = NULL;
9344 }
9345 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9346 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9347 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9348 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009349 if (thousands_sep_kind < kind) {
9350 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9351 if (!thousands_sep_data)
9352 return -1;
9353 }
9354 else {
9355 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9356 if (!data)
9357 return -1;
9358 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009359 }
9360
Benjamin Petersonead6b532011-12-20 17:23:42 -06009361 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009362 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009363 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009364 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009365 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009366 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009367 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009368 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009369 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009370 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009371 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009372 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009373 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009375 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009376 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009377 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009378 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009379 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009380 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009381 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009382 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009383 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009384 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009385 break;
9386 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009387 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009389 if (unicode != NULL && thousands_sep_kind != kind) {
9390 if (thousands_sep_kind < kind)
9391 PyMem_Free(thousands_sep_data);
9392 else
9393 PyMem_Free(data);
9394 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009395 if (unicode == NULL) {
9396 *maxchar = 127;
9397 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009398 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009399 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009400 }
9401 }
9402 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403}
9404
9405
Alexander Belopolsky40018472011-02-26 01:02:56 +00009406Py_ssize_t
9407PyUnicode_Count(PyObject *str,
9408 PyObject *substr,
9409 Py_ssize_t start,
9410 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009411{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009412 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009413 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 void *buf1 = NULL, *buf2 = NULL;
9415 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009416
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009417 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009418 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009419
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009420 kind1 = PyUnicode_KIND(str);
9421 kind2 = PyUnicode_KIND(substr);
9422 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009423 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009424
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009425 len1 = PyUnicode_GET_LENGTH(str);
9426 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009428 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009429 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009430
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009431 buf1 = PyUnicode_DATA(str);
9432 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009433 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009434 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009435 if (!buf2)
9436 goto onError;
9437 }
9438
9439 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009441 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009442 result = asciilib_count(
9443 ((Py_UCS1*)buf1) + start, end - start,
9444 buf2, len2, PY_SSIZE_T_MAX
9445 );
9446 else
9447 result = ucs1lib_count(
9448 ((Py_UCS1*)buf1) + start, end - start,
9449 buf2, len2, PY_SSIZE_T_MAX
9450 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 break;
9452 case PyUnicode_2BYTE_KIND:
9453 result = ucs2lib_count(
9454 ((Py_UCS2*)buf1) + start, end - start,
9455 buf2, len2, PY_SSIZE_T_MAX
9456 );
9457 break;
9458 case PyUnicode_4BYTE_KIND:
9459 result = ucs4lib_count(
9460 ((Py_UCS4*)buf1) + start, end - start,
9461 buf2, len2, PY_SSIZE_T_MAX
9462 );
9463 break;
9464 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009465 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009467
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009468 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469 PyMem_Free(buf2);
9470
Guido van Rossumd57fd912000-03-10 22:53:23 +00009471 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009473 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 PyMem_Free(buf2);
9475 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476}
9477
Alexander Belopolsky40018472011-02-26 01:02:56 +00009478Py_ssize_t
9479PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009480 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009481 Py_ssize_t start,
9482 Py_ssize_t end,
9483 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009485 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009486 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009487
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009488 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489}
9490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491Py_ssize_t
9492PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9493 Py_ssize_t start, Py_ssize_t end,
9494 int direction)
9495{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009497 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 if (PyUnicode_READY(str) == -1)
9499 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009500 len = PyUnicode_GET_LENGTH(str);
9501 ADJUST_INDICES(start, end, len);
9502 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009503 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009505 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9506 kind, end-start, ch, direction);
9507 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009509 else
9510 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511}
9512
Alexander Belopolsky40018472011-02-26 01:02:56 +00009513static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009514tailmatch(PyObject *self,
9515 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009516 Py_ssize_t start,
9517 Py_ssize_t end,
9518 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520 int kind_self;
9521 int kind_sub;
9522 void *data_self;
9523 void *data_sub;
9524 Py_ssize_t offset;
9525 Py_ssize_t i;
9526 Py_ssize_t end_sub;
9527
9528 if (PyUnicode_READY(self) == -1 ||
9529 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009530 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9533 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009535 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009537 if (PyUnicode_GET_LENGTH(substring) == 0)
9538 return 1;
9539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 kind_self = PyUnicode_KIND(self);
9541 data_self = PyUnicode_DATA(self);
9542 kind_sub = PyUnicode_KIND(substring);
9543 data_sub = PyUnicode_DATA(substring);
9544 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9545
9546 if (direction > 0)
9547 offset = end;
9548 else
9549 offset = start;
9550
9551 if (PyUnicode_READ(kind_self, data_self, offset) ==
9552 PyUnicode_READ(kind_sub, data_sub, 0) &&
9553 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9554 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9555 /* If both are of the same kind, memcmp is sufficient */
9556 if (kind_self == kind_sub) {
9557 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009558 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009559 data_sub,
9560 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009561 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009563 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 else {
9565 /* We do not need to compare 0 and len(substring)-1 because
9566 the if statement above ensured already that they are equal
9567 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 for (i = 1; i < end_sub; ++i) {
9569 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9570 PyUnicode_READ(kind_sub, data_sub, i))
9571 return 0;
9572 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009573 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009574 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009575 }
9576
9577 return 0;
9578}
9579
Alexander Belopolsky40018472011-02-26 01:02:56 +00009580Py_ssize_t
9581PyUnicode_Tailmatch(PyObject *str,
9582 PyObject *substr,
9583 Py_ssize_t start,
9584 Py_ssize_t end,
9585 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009587 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009588 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009589
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009590 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009591}
9592
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009593static PyObject *
9594ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009595{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009596 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9597 char *resdata, *data = PyUnicode_DATA(self);
9598 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009599
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009600 res = PyUnicode_New(len, 127);
9601 if (res == NULL)
9602 return NULL;
9603 resdata = PyUnicode_DATA(res);
9604 if (lower)
9605 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009607 _Py_bytes_upper(resdata, data, len);
9608 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609}
9610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009612handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009613{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009614 Py_ssize_t j;
9615 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009616 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009617 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009618
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009619 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9620
9621 where ! is a negation and \p{xxx} is a character with property xxx.
9622 */
9623 for (j = i - 1; j >= 0; j--) {
9624 c = PyUnicode_READ(kind, data, j);
9625 if (!_PyUnicode_IsCaseIgnorable(c))
9626 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009627 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009628 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9629 if (final_sigma) {
9630 for (j = i + 1; j < length; j++) {
9631 c = PyUnicode_READ(kind, data, j);
9632 if (!_PyUnicode_IsCaseIgnorable(c))
9633 break;
9634 }
9635 final_sigma = j == length || !_PyUnicode_IsCased(c);
9636 }
9637 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638}
9639
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009640static int
9641lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9642 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009644 /* Obscure special case. */
9645 if (c == 0x3A3) {
9646 mapped[0] = handle_capital_sigma(kind, data, length, i);
9647 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009649 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650}
9651
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009652static Py_ssize_t
9653do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009655 Py_ssize_t i, k = 0;
9656 int n_res, j;
9657 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009658
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009659 c = PyUnicode_READ(kind, data, 0);
9660 n_res = _PyUnicode_ToUpperFull(c, mapped);
9661 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009662 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009665 for (i = 1; i < length; i++) {
9666 c = PyUnicode_READ(kind, data, i);
9667 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9668 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009669 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009670 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009671 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009672 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674}
9675
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009676static Py_ssize_t
9677do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9678 Py_ssize_t i, k = 0;
9679
9680 for (i = 0; i < length; i++) {
9681 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9682 int n_res, j;
9683 if (Py_UNICODE_ISUPPER(c)) {
9684 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9685 }
9686 else if (Py_UNICODE_ISLOWER(c)) {
9687 n_res = _PyUnicode_ToUpperFull(c, mapped);
9688 }
9689 else {
9690 n_res = 1;
9691 mapped[0] = c;
9692 }
9693 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009694 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009695 res[k++] = mapped[j];
9696 }
9697 }
9698 return k;
9699}
9700
9701static Py_ssize_t
9702do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9703 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009705 Py_ssize_t i, k = 0;
9706
9707 for (i = 0; i < length; i++) {
9708 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9709 int n_res, j;
9710 if (lower)
9711 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9712 else
9713 n_res = _PyUnicode_ToUpperFull(c, mapped);
9714 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009715 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009716 res[k++] = mapped[j];
9717 }
9718 }
9719 return k;
9720}
9721
9722static Py_ssize_t
9723do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9724{
9725 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9726}
9727
9728static Py_ssize_t
9729do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9730{
9731 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9732}
9733
Benjamin Petersone51757f2012-01-12 21:10:29 -05009734static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009735do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9736{
9737 Py_ssize_t i, k = 0;
9738
9739 for (i = 0; i < length; i++) {
9740 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9741 Py_UCS4 mapped[3];
9742 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9743 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009744 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009745 res[k++] = mapped[j];
9746 }
9747 }
9748 return k;
9749}
9750
9751static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009752do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9753{
9754 Py_ssize_t i, k = 0;
9755 int previous_is_cased;
9756
9757 previous_is_cased = 0;
9758 for (i = 0; i < length; i++) {
9759 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9760 Py_UCS4 mapped[3];
9761 int n_res, j;
9762
9763 if (previous_is_cased)
9764 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9765 else
9766 n_res = _PyUnicode_ToTitleFull(c, mapped);
9767
9768 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009769 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009770 res[k++] = mapped[j];
9771 }
9772
9773 previous_is_cased = _PyUnicode_IsCased(c);
9774 }
9775 return k;
9776}
9777
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009778static PyObject *
9779case_operation(PyObject *self,
9780 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9781{
9782 PyObject *res = NULL;
9783 Py_ssize_t length, newlength = 0;
9784 int kind, outkind;
9785 void *data, *outdata;
9786 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9787
Benjamin Petersoneea48462012-01-16 14:28:50 -05009788 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009789
9790 kind = PyUnicode_KIND(self);
9791 data = PyUnicode_DATA(self);
9792 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009793 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009794 PyErr_SetString(PyExc_OverflowError, "string is too long");
9795 return NULL;
9796 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009797 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009798 if (tmp == NULL)
9799 return PyErr_NoMemory();
9800 newlength = perform(kind, data, length, tmp, &maxchar);
9801 res = PyUnicode_New(newlength, maxchar);
9802 if (res == NULL)
9803 goto leave;
9804 tmpend = tmp + newlength;
9805 outdata = PyUnicode_DATA(res);
9806 outkind = PyUnicode_KIND(res);
9807 switch (outkind) {
9808 case PyUnicode_1BYTE_KIND:
9809 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9810 break;
9811 case PyUnicode_2BYTE_KIND:
9812 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9813 break;
9814 case PyUnicode_4BYTE_KIND:
9815 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9816 break;
9817 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009818 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009819 }
9820 leave:
9821 PyMem_FREE(tmp);
9822 return res;
9823}
9824
Tim Peters8ce9f162004-08-27 01:49:32 +00009825PyObject *
9826PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009828 PyObject *res;
9829 PyObject *fseq;
9830 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009831 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009832
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009833 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009834 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009835 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009836 }
9837
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009838 /* NOTE: the following code can't call back into Python code,
9839 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009840 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009841
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009842 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009843 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009844 res = _PyUnicode_JoinArray(separator, items, seqlen);
9845 Py_DECREF(fseq);
9846 return res;
9847}
9848
9849PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009850_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009851{
9852 PyObject *res = NULL; /* the result */
9853 PyObject *sep = NULL;
9854 Py_ssize_t seplen;
9855 PyObject *item;
9856 Py_ssize_t sz, i, res_offset;
9857 Py_UCS4 maxchar;
9858 Py_UCS4 item_maxchar;
9859 int use_memcpy;
9860 unsigned char *res_data = NULL, *sep_data = NULL;
9861 PyObject *last_obj;
9862 unsigned int kind = 0;
9863
Tim Peters05eba1f2004-08-27 21:32:02 +00009864 /* If empty sequence, return u"". */
9865 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009866 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009867 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009868
Tim Peters05eba1f2004-08-27 21:32:02 +00009869 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009870 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009871 if (seqlen == 1) {
9872 if (PyUnicode_CheckExact(items[0])) {
9873 res = items[0];
9874 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009875 return res;
9876 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009877 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009878 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009879 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009880 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009881 /* Set up sep and seplen */
9882 if (separator == NULL) {
9883 /* fall back to a blank space separator */
9884 sep = PyUnicode_FromOrdinal(' ');
9885 if (!sep)
9886 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009887 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009888 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009889 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009890 else {
9891 if (!PyUnicode_Check(separator)) {
9892 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009893 "separator: expected str instance,"
9894 " %.80s found",
9895 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009896 goto onError;
9897 }
9898 if (PyUnicode_READY(separator))
9899 goto onError;
9900 sep = separator;
9901 seplen = PyUnicode_GET_LENGTH(separator);
9902 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9903 /* inc refcount to keep this code path symmetric with the
9904 above case of a blank separator */
9905 Py_INCREF(sep);
9906 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009907 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009908 }
9909
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009910 /* There are at least two things to join, or else we have a subclass
9911 * of str in the sequence.
9912 * Do a pre-pass to figure out the total amount of space we'll
9913 * need (sz), and see whether all argument are strings.
9914 */
9915 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009916#ifdef Py_DEBUG
9917 use_memcpy = 0;
9918#else
9919 use_memcpy = 1;
9920#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009921 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009922 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009923 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009924 if (!PyUnicode_Check(item)) {
9925 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009926 "sequence item %zd: expected str instance,"
9927 " %.80s found",
9928 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009929 goto onError;
9930 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 if (PyUnicode_READY(item) == -1)
9932 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009933 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009935 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009936 if (i != 0) {
9937 add_sz += seplen;
9938 }
9939 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009940 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009941 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009942 goto onError;
9943 }
Xiang Zhangb0541f42017-01-10 10:52:00 +08009944 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +02009945 if (use_memcpy && last_obj != NULL) {
9946 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9947 use_memcpy = 0;
9948 }
9949 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009950 }
Tim Petersced69f82003-09-16 20:30:58 +00009951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009953 if (res == NULL)
9954 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009955
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009956 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009957#ifdef Py_DEBUG
9958 use_memcpy = 0;
9959#else
9960 if (use_memcpy) {
9961 res_data = PyUnicode_1BYTE_DATA(res);
9962 kind = PyUnicode_KIND(res);
9963 if (seplen != 0)
9964 sep_data = PyUnicode_1BYTE_DATA(sep);
9965 }
9966#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009967 if (use_memcpy) {
9968 for (i = 0; i < seqlen; ++i) {
9969 Py_ssize_t itemlen;
9970 item = items[i];
9971
9972 /* Copy item, and maybe the separator. */
9973 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009974 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009975 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009976 kind * seplen);
9977 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009978 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009979
9980 itemlen = PyUnicode_GET_LENGTH(item);
9981 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009982 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009983 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009984 kind * itemlen);
9985 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009986 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009987 }
9988 assert(res_data == PyUnicode_1BYTE_DATA(res)
9989 + kind * PyUnicode_GET_LENGTH(res));
9990 }
9991 else {
9992 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9993 Py_ssize_t itemlen;
9994 item = items[i];
9995
9996 /* Copy item, and maybe the separator. */
9997 if (i && seplen != 0) {
9998 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9999 res_offset += seplen;
10000 }
10001
10002 itemlen = PyUnicode_GET_LENGTH(item);
10003 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010004 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010005 res_offset += itemlen;
10006 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010007 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010008 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010009 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010011 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010012 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014
Benjamin Peterson29060642009-01-31 22:14:21 +000010015 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010017 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018 return NULL;
10019}
10020
/* Store `length` copies of `value` into the character buffer `data`,
   interpreted according to `kind`, beginning at index `start`.

   `kind`, `value`, `start` and `length` are each evaluated exactly once, so
   arguments with side effects are safe and the fill loops do not re-evaluate
   them on every iteration.  `data` is evaluated once, in the selected case.
   The locals use a trailing underscore to stay clear of caller names. */
#define FILL(kind, data, value, start, length) \
    do { \
        const int kind_ = (kind); \
        const Py_UCS4 value_ = (value); \
        const Py_ssize_t start_ = (start); \
        const Py_ssize_t length_ = (length); \
        Py_ssize_t i_ = 0; \
        assert(kind_ != PyUnicode_WCHAR_KIND); \
        switch (kind_) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)(data) + start_; \
            memset(to_, (unsigned char)value_, length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)(data) + start_; \
            for (; i_ < length_; ++i_, ++to_) *to_ = (Py_UCS2)value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)(data) + start_; \
            for (; i_ < length_; ++i_, ++to_) *to_ = value_; \
            break; \
        } \
        default: Py_UNREACHABLE(); \
        } \
    } while (0)
10044
Victor Stinnerd3f08822012-05-29 12:57:52 +020010045void
10046_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10047 Py_UCS4 fill_char)
10048{
10049 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10050 const void *data = PyUnicode_DATA(unicode);
10051 assert(PyUnicode_IS_READY(unicode));
10052 assert(unicode_modifiable(unicode));
10053 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10054 assert(start >= 0);
10055 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10056 FILL(kind, data, fill_char, start, length);
10057}
10058
Victor Stinner3fe55312012-01-04 00:33:50 +010010059Py_ssize_t
10060PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10061 Py_UCS4 fill_char)
10062{
10063 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010064
10065 if (!PyUnicode_Check(unicode)) {
10066 PyErr_BadInternalCall();
10067 return -1;
10068 }
10069 if (PyUnicode_READY(unicode) == -1)
10070 return -1;
10071 if (unicode_check_modifiable(unicode))
10072 return -1;
10073
Victor Stinnerd3f08822012-05-29 12:57:52 +020010074 if (start < 0) {
10075 PyErr_SetString(PyExc_IndexError, "string index out of range");
10076 return -1;
10077 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010078 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10079 PyErr_SetString(PyExc_ValueError,
10080 "fill character is bigger than "
10081 "the string maximum character");
10082 return -1;
10083 }
10084
10085 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10086 length = Py_MIN(maxlen, length);
10087 if (length <= 0)
10088 return 0;
10089
Victor Stinnerd3f08822012-05-29 12:57:52 +020010090 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010091 return length;
10092}
10093
Victor Stinner9310abb2011-10-05 00:59:23 +020010094static PyObject *
10095pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010096 Py_ssize_t left,
10097 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010099{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100 PyObject *u;
10101 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010102 int kind;
10103 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010104
10105 if (left < 0)
10106 left = 0;
10107 if (right < 0)
10108 right = 0;
10109
Victor Stinnerc4b49542011-12-11 22:44:26 +010010110 if (left == 0 && right == 0)
10111 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10114 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010115 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10116 return NULL;
10117 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010119 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010121 if (!u)
10122 return NULL;
10123
10124 kind = PyUnicode_KIND(u);
10125 data = PyUnicode_DATA(u);
10126 if (left)
10127 FILL(kind, data, fill, 0, left);
10128 if (right)
10129 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010130 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010131 assert(_PyUnicode_CheckConsistency(u, 1));
10132 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010133}
10134
Alexander Belopolsky40018472011-02-26 01:02:56 +000010135PyObject *
10136PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010140 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010141 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142
Benjamin Petersonead6b532011-12-20 17:23:42 -060010143 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010145 if (PyUnicode_IS_ASCII(string))
10146 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010147 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010148 PyUnicode_GET_LENGTH(string), keepends);
10149 else
10150 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010151 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010152 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 break;
10154 case PyUnicode_2BYTE_KIND:
10155 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010156 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 PyUnicode_GET_LENGTH(string), keepends);
10158 break;
10159 case PyUnicode_4BYTE_KIND:
10160 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010161 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 PyUnicode_GET_LENGTH(string), keepends);
10163 break;
10164 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010165 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168}
10169
Alexander Belopolsky40018472011-02-26 01:02:56 +000010170static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010171split(PyObject *self,
10172 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010173 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010175 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 void *buf1, *buf2;
10177 Py_ssize_t len1, len2;
10178 PyObject* out;
10179
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010181 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 if (PyUnicode_READY(self) == -1)
10184 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010187 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010189 if (PyUnicode_IS_ASCII(self))
10190 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010191 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010192 PyUnicode_GET_LENGTH(self), maxcount
10193 );
10194 else
10195 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010196 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010197 PyUnicode_GET_LENGTH(self), maxcount
10198 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010199 case PyUnicode_2BYTE_KIND:
10200 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010201 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 PyUnicode_GET_LENGTH(self), maxcount
10203 );
10204 case PyUnicode_4BYTE_KIND:
10205 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010206 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 PyUnicode_GET_LENGTH(self), maxcount
10208 );
10209 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010210 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 }
10212
10213 if (PyUnicode_READY(substring) == -1)
10214 return NULL;
10215
10216 kind1 = PyUnicode_KIND(self);
10217 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 len1 = PyUnicode_GET_LENGTH(self);
10219 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010220 if (kind1 < kind2 || len1 < len2) {
10221 out = PyList_New(1);
10222 if (out == NULL)
10223 return NULL;
10224 Py_INCREF(self);
10225 PyList_SET_ITEM(out, 0, self);
10226 return out;
10227 }
10228 buf1 = PyUnicode_DATA(self);
10229 buf2 = PyUnicode_DATA(substring);
10230 if (kind2 != kind1) {
10231 buf2 = _PyUnicode_AsKind(substring, kind1);
10232 if (!buf2)
10233 return NULL;
10234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010236 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010238 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10239 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010240 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010241 else
10242 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010243 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 break;
10245 case PyUnicode_2BYTE_KIND:
10246 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010247 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 break;
10249 case PyUnicode_4BYTE_KIND:
10250 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010251 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 break;
10253 default:
10254 out = NULL;
10255 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010256 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 PyMem_Free(buf2);
10258 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010259}
10260
Alexander Belopolsky40018472011-02-26 01:02:56 +000010261static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010262rsplit(PyObject *self,
10263 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010264 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010265{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010266 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 void *buf1, *buf2;
10268 Py_ssize_t len1, len2;
10269 PyObject* out;
10270
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010271 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010272 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 if (PyUnicode_READY(self) == -1)
10275 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010278 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010280 if (PyUnicode_IS_ASCII(self))
10281 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010282 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010283 PyUnicode_GET_LENGTH(self), maxcount
10284 );
10285 else
10286 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010287 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010288 PyUnicode_GET_LENGTH(self), maxcount
10289 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 case PyUnicode_2BYTE_KIND:
10291 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010292 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 PyUnicode_GET_LENGTH(self), maxcount
10294 );
10295 case PyUnicode_4BYTE_KIND:
10296 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010297 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 PyUnicode_GET_LENGTH(self), maxcount
10299 );
10300 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010301 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 }
10303
10304 if (PyUnicode_READY(substring) == -1)
10305 return NULL;
10306
10307 kind1 = PyUnicode_KIND(self);
10308 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 len1 = PyUnicode_GET_LENGTH(self);
10310 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010311 if (kind1 < kind2 || len1 < len2) {
10312 out = PyList_New(1);
10313 if (out == NULL)
10314 return NULL;
10315 Py_INCREF(self);
10316 PyList_SET_ITEM(out, 0, self);
10317 return out;
10318 }
10319 buf1 = PyUnicode_DATA(self);
10320 buf2 = PyUnicode_DATA(substring);
10321 if (kind2 != kind1) {
10322 buf2 = _PyUnicode_AsKind(substring, kind1);
10323 if (!buf2)
10324 return NULL;
10325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010327 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010329 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10330 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010331 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010332 else
10333 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010334 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 break;
10336 case PyUnicode_2BYTE_KIND:
10337 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010338 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 break;
10340 case PyUnicode_4BYTE_KIND:
10341 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010342 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 break;
10344 default:
10345 out = NULL;
10346 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010347 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 PyMem_Free(buf2);
10349 return out;
10350}
10351
10352static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010353anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10354 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010356 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010358 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10359 return asciilib_find(buf1, len1, buf2, len2, offset);
10360 else
10361 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 case PyUnicode_2BYTE_KIND:
10363 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10364 case PyUnicode_4BYTE_KIND:
10365 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10366 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010367 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368}
10369
10370static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010371anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10372 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010374 switch (kind) {
10375 case PyUnicode_1BYTE_KIND:
10376 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10377 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10378 else
10379 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10380 case PyUnicode_2BYTE_KIND:
10381 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10382 case PyUnicode_4BYTE_KIND:
10383 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10384 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010385 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010386}
10387
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010388static void
10389replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10390 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10391{
10392 int kind = PyUnicode_KIND(u);
10393 void *data = PyUnicode_DATA(u);
10394 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10395 if (kind == PyUnicode_1BYTE_KIND) {
10396 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10397 (Py_UCS1 *)data + len,
10398 u1, u2, maxcount);
10399 }
10400 else if (kind == PyUnicode_2BYTE_KIND) {
10401 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10402 (Py_UCS2 *)data + len,
10403 u1, u2, maxcount);
10404 }
10405 else {
10406 assert(kind == PyUnicode_4BYTE_KIND);
10407 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10408 (Py_UCS4 *)data + len,
10409 u1, u2, maxcount);
10410 }
10411}
10412
Alexander Belopolsky40018472011-02-26 01:02:56 +000010413static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414replace(PyObject *self, PyObject *str1,
10415 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010416{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 PyObject *u;
10418 char *sbuf = PyUnicode_DATA(self);
10419 char *buf1 = PyUnicode_DATA(str1);
10420 char *buf2 = PyUnicode_DATA(str2);
10421 int srelease = 0, release1 = 0, release2 = 0;
10422 int skind = PyUnicode_KIND(self);
10423 int kind1 = PyUnicode_KIND(str1);
10424 int kind2 = PyUnicode_KIND(str2);
10425 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10426 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10427 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010428 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010429 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430
10431 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010432 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010434 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435
Victor Stinner59de0ee2011-10-07 10:01:28 +020010436 if (str1 == str2)
10437 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438
Victor Stinner49a0a212011-10-12 23:46:10 +020010439 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010440 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10441 if (maxchar < maxchar_str1)
10442 /* substring too wide to be present */
10443 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010444 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10445 /* Replacing str1 with str2 may cause a maxchar reduction in the
10446 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010447 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010448 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010451 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010453 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010455 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010456 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010457 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010458
Victor Stinner69ed0f42013-04-09 21:48:24 +020010459 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010460 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010461 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010462 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010463 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010465 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010467
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010468 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10469 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010470 }
10471 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 int rkind = skind;
10473 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010474 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 if (kind1 < rkind) {
10477 /* widen substring */
10478 buf1 = _PyUnicode_AsKind(str1, rkind);
10479 if (!buf1) goto error;
10480 release1 = 1;
10481 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010482 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010483 if (i < 0)
10484 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 if (rkind > kind2) {
10486 /* widen replacement */
10487 buf2 = _PyUnicode_AsKind(str2, rkind);
10488 if (!buf2) goto error;
10489 release2 = 1;
10490 }
10491 else if (rkind < kind2) {
10492 /* widen self and buf1 */
10493 rkind = kind2;
10494 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010495 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 sbuf = _PyUnicode_AsKind(self, rkind);
10497 if (!sbuf) goto error;
10498 srelease = 1;
10499 buf1 = _PyUnicode_AsKind(str1, rkind);
10500 if (!buf1) goto error;
10501 release1 = 1;
10502 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010503 u = PyUnicode_New(slen, maxchar);
10504 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010506 assert(PyUnicode_KIND(u) == rkind);
10507 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010508
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010509 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010510 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010511 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010513 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010515
10516 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010517 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010518 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010519 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010520 if (i == -1)
10521 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010522 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010524 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010526 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010528 }
10529 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010531 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 int rkind = skind;
10533 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010536 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 buf1 = _PyUnicode_AsKind(str1, rkind);
10538 if (!buf1) goto error;
10539 release1 = 1;
10540 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010541 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010542 if (n == 0)
10543 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010545 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 buf2 = _PyUnicode_AsKind(str2, rkind);
10547 if (!buf2) goto error;
10548 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010551 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 rkind = kind2;
10553 sbuf = _PyUnicode_AsKind(self, rkind);
10554 if (!sbuf) goto error;
10555 srelease = 1;
10556 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010557 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 buf1 = _PyUnicode_AsKind(str1, rkind);
10559 if (!buf1) goto error;
10560 release1 = 1;
10561 }
10562 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10563 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010564 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 PyErr_SetString(PyExc_OverflowError,
10566 "replace string is too long");
10567 goto error;
10568 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010569 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010570 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010571 _Py_INCREF_UNICODE_EMPTY();
10572 if (!unicode_empty)
10573 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010574 u = unicode_empty;
10575 goto done;
10576 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010577 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 PyErr_SetString(PyExc_OverflowError,
10579 "replace string is too long");
10580 goto error;
10581 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010582 u = PyUnicode_New(new_size, maxchar);
10583 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010585 assert(PyUnicode_KIND(u) == rkind);
10586 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 ires = i = 0;
10588 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010589 while (n-- > 0) {
10590 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010591 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010592 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010593 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010594 if (j == -1)
10595 break;
10596 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010597 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010598 memcpy(res + rkind * ires,
10599 sbuf + rkind * i,
10600 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010602 }
10603 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010605 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010607 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010613 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010614 memcpy(res + rkind * ires,
10615 sbuf + rkind * i,
10616 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010617 }
10618 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010619 /* interleave */
10620 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010621 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010623 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010625 if (--n <= 0)
10626 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010627 memcpy(res + rkind * ires,
10628 sbuf + rkind * i,
10629 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 ires++;
10631 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010632 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010633 memcpy(res + rkind * ires,
10634 sbuf + rkind * i,
10635 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010636 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010637 }
10638
10639 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010640 unicode_adjust_maxchar(&u);
10641 if (u == NULL)
10642 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010644
10645 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 if (srelease)
10647 PyMem_FREE(sbuf);
10648 if (release1)
10649 PyMem_FREE(buf1);
10650 if (release2)
10651 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010652 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010654
Benjamin Peterson29060642009-01-31 22:14:21 +000010655 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010656 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 if (srelease)
10658 PyMem_FREE(sbuf);
10659 if (release1)
10660 PyMem_FREE(buf1);
10661 if (release2)
10662 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010663 return unicode_result_unchanged(self);
10664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 error:
10666 if (srelease && sbuf)
10667 PyMem_FREE(sbuf);
10668 if (release1 && buf1)
10669 PyMem_FREE(buf1);
10670 if (release2 && buf2)
10671 PyMem_FREE(buf2);
10672 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010673}
10674
10675/* --- Unicode Object Methods --------------------------------------------- */
10676
INADA Naoki3ae20562017-01-16 20:41:20 +090010677/*[clinic input]
10678str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679
INADA Naoki3ae20562017-01-16 20:41:20 +090010680Return a version of the string where each word is titlecased.
10681
10682More specifically, words start with uppercased characters and all remaining
10683cased characters have lower case.
10684[clinic start generated code]*/
10685
10686static PyObject *
10687unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010688/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010689{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010690 if (PyUnicode_READY(self) == -1)
10691 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010692 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693}
10694
INADA Naoki3ae20562017-01-16 20:41:20 +090010695/*[clinic input]
10696str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697
INADA Naoki3ae20562017-01-16 20:41:20 +090010698Return a capitalized version of the string.
10699
10700More specifically, make the first character have upper case and the rest lower
10701case.
10702[clinic start generated code]*/
10703
10704static PyObject *
10705unicode_capitalize_impl(PyObject *self)
10706/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010708 if (PyUnicode_READY(self) == -1)
10709 return NULL;
10710 if (PyUnicode_GET_LENGTH(self) == 0)
10711 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010712 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713}
10714
INADA Naoki3ae20562017-01-16 20:41:20 +090010715/*[clinic input]
10716str.casefold as unicode_casefold
10717
10718Return a version of the string suitable for caseless comparisons.
10719[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010720
10721static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010722unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010723/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010724{
10725 if (PyUnicode_READY(self) == -1)
10726 return NULL;
10727 if (PyUnicode_IS_ASCII(self))
10728 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010729 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010730}
10731
10732
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010733/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010734
10735static int
10736convert_uc(PyObject *obj, void *addr)
10737{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010739
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010740 if (!PyUnicode_Check(obj)) {
10741 PyErr_Format(PyExc_TypeError,
10742 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010743 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010744 return 0;
10745 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010746 if (PyUnicode_READY(obj) < 0)
10747 return 0;
10748 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010749 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010750 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010751 return 0;
10752 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010753 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010754 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010755}
10756
INADA Naoki3ae20562017-01-16 20:41:20 +090010757/*[clinic input]
10758str.center as unicode_center
10759
10760 width: Py_ssize_t
10761 fillchar: Py_UCS4 = ' '
10762 /
10763
10764Return a centered string of length width.
10765
10766Padding is done using the specified fill character (default is a space).
10767[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768
10769static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010770unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10771/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010773 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774
Benjamin Petersonbac79492012-01-14 13:34:47 -050010775 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776 return NULL;
10777
Victor Stinnerc4b49542011-12-11 22:44:26 +010010778 if (PyUnicode_GET_LENGTH(self) >= width)
10779 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780
Victor Stinnerc4b49542011-12-11 22:44:26 +010010781 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782 left = marg / 2 + (marg & width & 1);
10783
Victor Stinner9310abb2011-10-05 00:59:23 +020010784 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785}
10786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787/* This function assumes that str1 and str2 are readied by the caller. */
10788
Marc-André Lemburge5034372000-08-08 08:04:29 +000010789static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010790unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010791{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010792#define COMPARE(TYPE1, TYPE2) \
10793 do { \
10794 TYPE1* p1 = (TYPE1 *)data1; \
10795 TYPE2* p2 = (TYPE2 *)data2; \
10796 TYPE1* end = p1 + len; \
10797 Py_UCS4 c1, c2; \
10798 for (; p1 != end; p1++, p2++) { \
10799 c1 = *p1; \
10800 c2 = *p2; \
10801 if (c1 != c2) \
10802 return (c1 < c2) ? -1 : 1; \
10803 } \
10804 } \
10805 while (0)
10806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 int kind1, kind2;
10808 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010809 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 kind1 = PyUnicode_KIND(str1);
10812 kind2 = PyUnicode_KIND(str2);
10813 data1 = PyUnicode_DATA(str1);
10814 data2 = PyUnicode_DATA(str2);
10815 len1 = PyUnicode_GET_LENGTH(str1);
10816 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010817 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010818
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010819 switch(kind1) {
10820 case PyUnicode_1BYTE_KIND:
10821 {
10822 switch(kind2) {
10823 case PyUnicode_1BYTE_KIND:
10824 {
10825 int cmp = memcmp(data1, data2, len);
10826 /* normalize result of memcmp() into the range [-1; 1] */
10827 if (cmp < 0)
10828 return -1;
10829 if (cmp > 0)
10830 return 1;
10831 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010832 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010833 case PyUnicode_2BYTE_KIND:
10834 COMPARE(Py_UCS1, Py_UCS2);
10835 break;
10836 case PyUnicode_4BYTE_KIND:
10837 COMPARE(Py_UCS1, Py_UCS4);
10838 break;
10839 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010840 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010841 }
10842 break;
10843 }
10844 case PyUnicode_2BYTE_KIND:
10845 {
10846 switch(kind2) {
10847 case PyUnicode_1BYTE_KIND:
10848 COMPARE(Py_UCS2, Py_UCS1);
10849 break;
10850 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010851 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010852 COMPARE(Py_UCS2, Py_UCS2);
10853 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010854 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010855 case PyUnicode_4BYTE_KIND:
10856 COMPARE(Py_UCS2, Py_UCS4);
10857 break;
10858 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010859 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010860 }
10861 break;
10862 }
10863 case PyUnicode_4BYTE_KIND:
10864 {
10865 switch(kind2) {
10866 case PyUnicode_1BYTE_KIND:
10867 COMPARE(Py_UCS4, Py_UCS1);
10868 break;
10869 case PyUnicode_2BYTE_KIND:
10870 COMPARE(Py_UCS4, Py_UCS2);
10871 break;
10872 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010873 {
10874#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10875 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10876 /* normalize result of wmemcmp() into the range [-1; 1] */
10877 if (cmp < 0)
10878 return -1;
10879 if (cmp > 0)
10880 return 1;
10881#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010882 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010883#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010884 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010885 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010886 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010887 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010888 }
10889 break;
10890 }
10891 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010892 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010893 }
10894
Victor Stinner770e19e2012-10-04 22:59:45 +020010895 if (len1 == len2)
10896 return 0;
10897 if (len1 < len2)
10898 return -1;
10899 else
10900 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010901
10902#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010903}
10904
Benjamin Peterson621b4302016-09-09 13:54:34 -070010905static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010906unicode_compare_eq(PyObject *str1, PyObject *str2)
10907{
10908 int kind;
10909 void *data1, *data2;
10910 Py_ssize_t len;
10911 int cmp;
10912
Victor Stinnere5567ad2012-10-23 02:48:49 +020010913 len = PyUnicode_GET_LENGTH(str1);
10914 if (PyUnicode_GET_LENGTH(str2) != len)
10915 return 0;
10916 kind = PyUnicode_KIND(str1);
10917 if (PyUnicode_KIND(str2) != kind)
10918 return 0;
10919 data1 = PyUnicode_DATA(str1);
10920 data2 = PyUnicode_DATA(str2);
10921
10922 cmp = memcmp(data1, data2, len * kind);
10923 return (cmp == 0);
10924}
10925
10926
Alexander Belopolsky40018472011-02-26 01:02:56 +000010927int
10928PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10931 if (PyUnicode_READY(left) == -1 ||
10932 PyUnicode_READY(right) == -1)
10933 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010934
10935 /* a string is equal to itself */
10936 if (left == right)
10937 return 0;
10938
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010939 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010940 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010941 PyErr_Format(PyExc_TypeError,
10942 "Can't compare %.100s and %.100s",
10943 left->ob_type->tp_name,
10944 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945 return -1;
10946}
10947
Martin v. Löwis5b222132007-06-10 09:51:05 +000010948int
10949PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10950{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 Py_ssize_t i;
10952 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010954 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955
Victor Stinner910337b2011-10-03 03:20:16 +020010956 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010957 if (!PyUnicode_IS_READY(uni)) {
10958 const wchar_t *ws = _PyUnicode_WSTR(uni);
10959 /* Compare Unicode string and source character set string */
10960 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
10961 if (chr != ustr[i])
10962 return (chr < ustr[i]) ? -1 : 1;
10963 }
10964 /* This check keeps Python strings that end in '\0' from comparing equal
10965 to C strings identical up to that point. */
10966 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
10967 return 1; /* uni is longer */
10968 if (ustr[i])
10969 return -1; /* str is longer */
10970 return 0;
10971 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010972 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010973 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010974 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010975 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010976 size_t len, len2 = strlen(str);
10977 int cmp;
10978
10979 len = Py_MIN(len1, len2);
10980 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010981 if (cmp != 0) {
10982 if (cmp < 0)
10983 return -1;
10984 else
10985 return 1;
10986 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010987 if (len1 > len2)
10988 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010989 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010990 return -1; /* str is longer */
10991 return 0;
10992 }
10993 else {
10994 void *data = PyUnicode_DATA(uni);
10995 /* Compare Unicode string and source character set string */
10996 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010997 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010998 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10999 /* This check keeps Python strings that end in '\0' from comparing equal
11000 to C strings identical up to that point. */
11001 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11002 return 1; /* uni is longer */
11003 if (str[i])
11004 return -1; /* str is longer */
11005 return 0;
11006 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011007}
11008
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011009static int
11010non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11011{
11012 size_t i, len;
11013 const wchar_t *p;
11014 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11015 if (strlen(str) != len)
11016 return 0;
11017 p = _PyUnicode_WSTR(unicode);
11018 assert(p);
11019 for (i = 0; i < len; i++) {
11020 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011021 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011022 return 0;
11023 }
11024 return 1;
11025}
11026
11027int
11028_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11029{
11030 size_t len;
11031 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011032 assert(str);
11033#ifndef NDEBUG
11034 for (const char *p = str; *p; p++) {
11035 assert((unsigned char)*p < 128);
11036 }
11037#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011038 if (PyUnicode_READY(unicode) == -1) {
11039 /* Memory error or bad data */
11040 PyErr_Clear();
11041 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11042 }
11043 if (!PyUnicode_IS_ASCII(unicode))
11044 return 0;
11045 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11046 return strlen(str) == len &&
11047 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11048}
11049
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011050int
11051_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11052{
11053 PyObject *right_uni;
11054 Py_hash_t hash;
11055
11056 assert(_PyUnicode_CHECK(left));
11057 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011058#ifndef NDEBUG
11059 for (const char *p = right->string; *p; p++) {
11060 assert((unsigned char)*p < 128);
11061 }
11062#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011063
11064 if (PyUnicode_READY(left) == -1) {
11065 /* memory error or bad data */
11066 PyErr_Clear();
11067 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11068 }
11069
11070 if (!PyUnicode_IS_ASCII(left))
11071 return 0;
11072
11073 right_uni = _PyUnicode_FromId(right); /* borrowed */
11074 if (right_uni == NULL) {
11075 /* memory error or bad data */
11076 PyErr_Clear();
11077 return _PyUnicode_EqualToASCIIString(left, right->string);
11078 }
11079
11080 if (left == right_uni)
11081 return 1;
11082
11083 if (PyUnicode_CHECK_INTERNED(left))
11084 return 0;
11085
INADA Naoki7cc95f52018-01-28 02:07:09 +090011086 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011087 hash = _PyUnicode_HASH(left);
11088 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11089 return 0;
11090
11091 return unicode_compare_eq(left, right_uni);
11092}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011093
Alexander Belopolsky40018472011-02-26 01:02:56 +000011094PyObject *
11095PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011096{
11097 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011098
Victor Stinnere5567ad2012-10-23 02:48:49 +020011099 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11100 Py_RETURN_NOTIMPLEMENTED;
11101
11102 if (PyUnicode_READY(left) == -1 ||
11103 PyUnicode_READY(right) == -1)
11104 return NULL;
11105
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011106 if (left == right) {
11107 switch (op) {
11108 case Py_EQ:
11109 case Py_LE:
11110 case Py_GE:
11111 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011112 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011113 case Py_NE:
11114 case Py_LT:
11115 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011116 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011117 default:
11118 PyErr_BadArgument();
11119 return NULL;
11120 }
11121 }
11122 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011123 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011124 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011125 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011126 }
11127 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011128 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011129 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011130 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011131}
11132
Alexander Belopolsky40018472011-02-26 01:02:56 +000011133int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011134_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11135{
11136 return unicode_eq(aa, bb);
11137}
11138
11139int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011140PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011141{
Victor Stinner77282cb2013-04-14 19:22:47 +020011142 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143 void *buf1, *buf2;
11144 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011145 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011146
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011147 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011148 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011149 "'in <string>' requires string as left operand, not %.100s",
11150 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011151 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011152 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011153 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011154 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011155 if (ensure_unicode(str) < 0)
11156 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011159 kind2 = PyUnicode_KIND(substr);
11160 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011161 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011163 len2 = PyUnicode_GET_LENGTH(substr);
11164 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011165 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011166 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011167 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011168 if (len2 == 1) {
11169 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11170 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011171 return result;
11172 }
11173 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011174 buf2 = _PyUnicode_AsKind(substr, kind1);
11175 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011176 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011177 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011178
Victor Stinner77282cb2013-04-14 19:22:47 +020011179 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 case PyUnicode_1BYTE_KIND:
11181 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11182 break;
11183 case PyUnicode_2BYTE_KIND:
11184 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11185 break;
11186 case PyUnicode_4BYTE_KIND:
11187 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11188 break;
11189 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011190 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011191 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011192
Victor Stinner77282cb2013-04-14 19:22:47 +020011193 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 PyMem_Free(buf2);
11195
Guido van Rossum403d68b2000-03-13 15:55:09 +000011196 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011197}
11198
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199/* Concat to string or Unicode object giving a new Unicode object. */
11200
Alexander Belopolsky40018472011-02-26 01:02:56 +000011201PyObject *
11202PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011204 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011205 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011206 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011208 if (ensure_unicode(left) < 0)
11209 return NULL;
11210
11211 if (!PyUnicode_Check(right)) {
11212 PyErr_Format(PyExc_TypeError,
11213 "can only concatenate str (not \"%.200s\") to str",
11214 right->ob_type->tp_name);
11215 return NULL;
11216 }
11217 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011218 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219
11220 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011221 if (left == unicode_empty)
11222 return PyUnicode_FromObject(right);
11223 if (right == unicode_empty)
11224 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011226 left_len = PyUnicode_GET_LENGTH(left);
11227 right_len = PyUnicode_GET_LENGTH(right);
11228 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011229 PyErr_SetString(PyExc_OverflowError,
11230 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011231 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011232 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011233 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011234
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011235 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11236 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011237 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011240 result = PyUnicode_New(new_len, maxchar);
11241 if (result == NULL)
11242 return NULL;
11243 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11244 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11245 assert(_PyUnicode_CheckConsistency(result, 1));
11246 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247}
11248
Walter Dörwald1ab83302007-05-18 17:15:44 +000011249void
Victor Stinner23e56682011-10-03 03:54:37 +020011250PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011251{
Victor Stinner23e56682011-10-03 03:54:37 +020011252 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011253 Py_UCS4 maxchar, maxchar2;
11254 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011255
11256 if (p_left == NULL) {
11257 if (!PyErr_Occurred())
11258 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011259 return;
11260 }
Victor Stinner23e56682011-10-03 03:54:37 +020011261 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011262 if (right == NULL || left == NULL
11263 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011264 if (!PyErr_Occurred())
11265 PyErr_BadInternalCall();
11266 goto error;
11267 }
11268
Benjamin Petersonbac79492012-01-14 13:34:47 -050011269 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011270 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011271 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011272 goto error;
11273
Victor Stinner488fa492011-12-12 00:01:39 +010011274 /* Shortcuts */
11275 if (left == unicode_empty) {
11276 Py_DECREF(left);
11277 Py_INCREF(right);
11278 *p_left = right;
11279 return;
11280 }
11281 if (right == unicode_empty)
11282 return;
11283
11284 left_len = PyUnicode_GET_LENGTH(left);
11285 right_len = PyUnicode_GET_LENGTH(right);
11286 if (left_len > PY_SSIZE_T_MAX - right_len) {
11287 PyErr_SetString(PyExc_OverflowError,
11288 "strings are too large to concat");
11289 goto error;
11290 }
11291 new_len = left_len + right_len;
11292
11293 if (unicode_modifiable(left)
11294 && PyUnicode_CheckExact(right)
11295 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011296 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11297 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011298 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011299 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011300 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11301 {
11302 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011303 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011304 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011305
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011306 /* copy 'right' into the newly allocated area of 'left' */
11307 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011308 }
Victor Stinner488fa492011-12-12 00:01:39 +010011309 else {
11310 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11311 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011312 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011313
Victor Stinner488fa492011-12-12 00:01:39 +010011314 /* Concat the two Unicode strings */
11315 res = PyUnicode_New(new_len, maxchar);
11316 if (res == NULL)
11317 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011318 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11319 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011320 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011321 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011322 }
11323 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011324 return;
11325
11326error:
Victor Stinner488fa492011-12-12 00:01:39 +010011327 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011328}
11329
11330void
11331PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11332{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011333 PyUnicode_Append(pleft, right);
11334 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011335}
11336
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011337/*
11338Wraps stringlib_parse_args_finds() and additionally ensures that the
11339first argument is a unicode object.
11340*/
11341
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011342static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011343parse_args_finds_unicode(const char * function_name, PyObject *args,
11344 PyObject **substring,
11345 Py_ssize_t *start, Py_ssize_t *end)
11346{
11347 if(stringlib_parse_args_finds(function_name, args, substring,
11348 start, end)) {
11349 if (ensure_unicode(*substring) < 0)
11350 return 0;
11351 return 1;
11352 }
11353 return 0;
11354}
11355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011356PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011357 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011359Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011360string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011361interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362
11363static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011364unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011366 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011367 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011368 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011370 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011371 void *buf1, *buf2;
11372 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011374 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011375 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 kind1 = PyUnicode_KIND(self);
11378 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011379 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011380 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 len1 = PyUnicode_GET_LENGTH(self);
11383 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011385 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011386 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011387
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011388 buf1 = PyUnicode_DATA(self);
11389 buf2 = PyUnicode_DATA(substring);
11390 if (kind2 != kind1) {
11391 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011392 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011393 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011394 }
11395 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 case PyUnicode_1BYTE_KIND:
11397 iresult = ucs1lib_count(
11398 ((Py_UCS1*)buf1) + start, end - start,
11399 buf2, len2, PY_SSIZE_T_MAX
11400 );
11401 break;
11402 case PyUnicode_2BYTE_KIND:
11403 iresult = ucs2lib_count(
11404 ((Py_UCS2*)buf1) + start, end - start,
11405 buf2, len2, PY_SSIZE_T_MAX
11406 );
11407 break;
11408 case PyUnicode_4BYTE_KIND:
11409 iresult = ucs4lib_count(
11410 ((Py_UCS4*)buf1) + start, end - start,
11411 buf2, len2, PY_SSIZE_T_MAX
11412 );
11413 break;
11414 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011415 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011416 }
11417
11418 result = PyLong_FromSsize_t(iresult);
11419
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011420 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011421 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423 return result;
11424}
11425
INADA Naoki3ae20562017-01-16 20:41:20 +090011426/*[clinic input]
11427str.encode as unicode_encode
11428
11429 encoding: str(c_default="NULL") = 'utf-8'
11430 The encoding in which to encode the string.
11431 errors: str(c_default="NULL") = 'strict'
11432 The error handling scheme to use for encoding errors.
11433 The default is 'strict' meaning that encoding errors raise a
11434 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11435 'xmlcharrefreplace' as well as any other name registered with
11436 codecs.register_error that can handle UnicodeEncodeErrors.
11437
11438Encode the string using the codec registered for encoding.
11439[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440
11441static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011442unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011443/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011445 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011446}
11447
INADA Naoki3ae20562017-01-16 20:41:20 +090011448/*[clinic input]
11449str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450
INADA Naoki3ae20562017-01-16 20:41:20 +090011451 tabsize: int = 8
11452
11453Return a copy where all tab characters are expanded using spaces.
11454
11455If tabsize is not given, a tab size of 8 characters is assumed.
11456[clinic start generated code]*/
11457
11458static PyObject *
11459unicode_expandtabs_impl(PyObject *self, int tabsize)
11460/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011462 Py_ssize_t i, j, line_pos, src_len, incr;
11463 Py_UCS4 ch;
11464 PyObject *u;
11465 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011466 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011467 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468
Antoine Pitrou22425222011-10-04 19:10:51 +020011469 if (PyUnicode_READY(self) == -1)
11470 return NULL;
11471
Thomas Wouters7e474022000-07-16 12:04:32 +000011472 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011473 src_len = PyUnicode_GET_LENGTH(self);
11474 i = j = line_pos = 0;
11475 kind = PyUnicode_KIND(self);
11476 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011477 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011478 for (; i < src_len; i++) {
11479 ch = PyUnicode_READ(kind, src_data, i);
11480 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011481 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011482 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011483 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011484 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011485 goto overflow;
11486 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011487 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011488 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011492 goto overflow;
11493 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011495 if (ch == '\n' || ch == '\r')
11496 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011498 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011499 if (!found)
11500 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011501
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011503 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504 if (!u)
11505 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011506 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507
Antoine Pitroue71d5742011-10-04 15:55:09 +020011508 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509
Antoine Pitroue71d5742011-10-04 15:55:09 +020011510 for (; i < src_len; i++) {
11511 ch = PyUnicode_READ(kind, src_data, i);
11512 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011513 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011514 incr = tabsize - (line_pos % tabsize);
11515 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011516 FILL(kind, dest_data, ' ', j, incr);
11517 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011518 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011519 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011520 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011521 line_pos++;
11522 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011523 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011524 if (ch == '\n' || ch == '\r')
11525 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011527 }
11528 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011529 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011530
Antoine Pitroue71d5742011-10-04 15:55:09 +020011531 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011532 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11533 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534}
11535
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011536PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011537 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538\n\
11539Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011540such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541arguments start and end are interpreted as in slice notation.\n\
11542\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011543Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544
11545static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011546unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011548 /* initialize variables to prevent gcc warning */
11549 PyObject *substring = NULL;
11550 Py_ssize_t start = 0;
11551 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011552 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011554 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011557 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011560 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562 if (result == -2)
11563 return NULL;
11564
Christian Heimes217cfd12007-12-02 14:31:20 +000011565 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566}
11567
11568static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011569unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011571 void *data;
11572 enum PyUnicode_Kind kind;
11573 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011574
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011575 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011576 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011578 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011579 if (PyUnicode_READY(self) == -1) {
11580 return NULL;
11581 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011582 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11583 PyErr_SetString(PyExc_IndexError, "string index out of range");
11584 return NULL;
11585 }
11586 kind = PyUnicode_KIND(self);
11587 data = PyUnicode_DATA(self);
11588 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011589 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590}
11591
Guido van Rossumc2504932007-09-18 19:42:40 +000011592/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011593 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011594static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011595unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596{
Guido van Rossumc2504932007-09-18 19:42:40 +000011597 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011598 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011599
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011600#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011601 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011602#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011603 if (_PyUnicode_HASH(self) != -1)
11604 return _PyUnicode_HASH(self);
11605 if (PyUnicode_READY(self) == -1)
11606 return -1;
11607 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011608 /*
11609 We make the hash of the empty string be 0, rather than using
11610 (prefix ^ suffix), since this slightly obfuscates the hash secret
11611 */
11612 if (len == 0) {
11613 _PyUnicode_HASH(self) = 0;
11614 return 0;
11615 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011616 x = _Py_HashBytes(PyUnicode_DATA(self),
11617 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011618 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011619 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620}
11621
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011622PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011623 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624\n\
oldkaa0735f2018-02-02 16:52:55 +080011625Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011626such that sub is contained within S[start:end]. Optional\n\
11627arguments start and end are interpreted as in slice notation.\n\
11628\n\
11629Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630
11631static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011634 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011635 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011636 PyObject *substring = NULL;
11637 Py_ssize_t start = 0;
11638 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011640 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011643 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011646 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011648 if (result == -2)
11649 return NULL;
11650
Guido van Rossumd57fd912000-03-10 22:53:23 +000011651 if (result < 0) {
11652 PyErr_SetString(PyExc_ValueError, "substring not found");
11653 return NULL;
11654 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011655
Christian Heimes217cfd12007-12-02 14:31:20 +000011656 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657}
11658
INADA Naoki3ae20562017-01-16 20:41:20 +090011659/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011660str.isascii as unicode_isascii
11661
11662Return True if all characters in the string are ASCII, False otherwise.
11663
11664ASCII characters have code points in the range U+0000-U+007F.
11665Empty string is ASCII too.
11666[clinic start generated code]*/
11667
11668static PyObject *
11669unicode_isascii_impl(PyObject *self)
11670/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11671{
11672 if (PyUnicode_READY(self) == -1) {
11673 return NULL;
11674 }
11675 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11676}
11677
11678/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011679str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680
INADA Naoki3ae20562017-01-16 20:41:20 +090011681Return True if the string is a lowercase string, False otherwise.
11682
11683A string is lowercase if all cased characters in the string are lowercase and
11684there is at least one cased character in the string.
11685[clinic start generated code]*/
11686
11687static PyObject *
11688unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011689/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011691 Py_ssize_t i, length;
11692 int kind;
11693 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694 int cased;
11695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 if (PyUnicode_READY(self) == -1)
11697 return NULL;
11698 length = PyUnicode_GET_LENGTH(self);
11699 kind = PyUnicode_KIND(self);
11700 data = PyUnicode_DATA(self);
11701
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 if (length == 1)
11704 return PyBool_FromLong(
11705 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011707 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011708 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011709 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011710
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 for (i = 0; i < length; i++) {
11713 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011714
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011716 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011717 else if (!cased && Py_UNICODE_ISLOWER(ch))
11718 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011720 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721}
11722
INADA Naoki3ae20562017-01-16 20:41:20 +090011723/*[clinic input]
11724str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725
INADA Naoki3ae20562017-01-16 20:41:20 +090011726Return True if the string is an uppercase string, False otherwise.
11727
11728A string is uppercase if all cased characters in the string are uppercase and
11729there is at least one cased character in the string.
11730[clinic start generated code]*/
11731
11732static PyObject *
11733unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011734/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011736 Py_ssize_t i, length;
11737 int kind;
11738 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739 int cased;
11740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011741 if (PyUnicode_READY(self) == -1)
11742 return NULL;
11743 length = PyUnicode_GET_LENGTH(self);
11744 kind = PyUnicode_KIND(self);
11745 data = PyUnicode_DATA(self);
11746
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011748 if (length == 1)
11749 return PyBool_FromLong(
11750 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011752 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011754 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011755
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 for (i = 0; i < length; i++) {
11758 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011759
Benjamin Peterson29060642009-01-31 22:14:21 +000011760 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011761 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011762 else if (!cased && Py_UNICODE_ISUPPER(ch))
11763 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011765 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766}
11767
INADA Naoki3ae20562017-01-16 20:41:20 +090011768/*[clinic input]
11769str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770
INADA Naoki3ae20562017-01-16 20:41:20 +090011771Return True if the string is a title-cased string, False otherwise.
11772
11773In a title-cased string, upper- and title-case characters may only
11774follow uncased characters and lowercase characters only cased ones.
11775[clinic start generated code]*/
11776
11777static PyObject *
11778unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011779/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 Py_ssize_t i, length;
11782 int kind;
11783 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784 int cased, previous_is_cased;
11785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 if (PyUnicode_READY(self) == -1)
11787 return NULL;
11788 length = PyUnicode_GET_LENGTH(self);
11789 kind = PyUnicode_KIND(self);
11790 data = PyUnicode_DATA(self);
11791
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 if (length == 1) {
11794 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11795 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11796 (Py_UNICODE_ISUPPER(ch) != 0));
11797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011799 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011801 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011802
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803 cased = 0;
11804 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 for (i = 0; i < length; i++) {
11806 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011807
Benjamin Peterson29060642009-01-31 22:14:21 +000011808 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11809 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011810 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011811 previous_is_cased = 1;
11812 cased = 1;
11813 }
11814 else if (Py_UNICODE_ISLOWER(ch)) {
11815 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011816 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011817 previous_is_cased = 1;
11818 cased = 1;
11819 }
11820 else
11821 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011823 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824}
11825
INADA Naoki3ae20562017-01-16 20:41:20 +090011826/*[clinic input]
11827str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828
INADA Naoki3ae20562017-01-16 20:41:20 +090011829Return True if the string is a whitespace string, False otherwise.
11830
11831A string is whitespace if all characters in the string are whitespace and there
11832is at least one character in the string.
11833[clinic start generated code]*/
11834
11835static PyObject *
11836unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011837/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011839 Py_ssize_t i, length;
11840 int kind;
11841 void *data;
11842
11843 if (PyUnicode_READY(self) == -1)
11844 return NULL;
11845 length = PyUnicode_GET_LENGTH(self);
11846 kind = PyUnicode_KIND(self);
11847 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 if (length == 1)
11851 return PyBool_FromLong(
11852 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011854 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011856 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011858 for (i = 0; i < length; i++) {
11859 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011860 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011861 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011863 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864}
11865
INADA Naoki3ae20562017-01-16 20:41:20 +090011866/*[clinic input]
11867str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011868
INADA Naoki3ae20562017-01-16 20:41:20 +090011869Return True if the string is an alphabetic string, False otherwise.
11870
11871A string is alphabetic if all characters in the string are alphabetic and there
11872is at least one character in the string.
11873[clinic start generated code]*/
11874
11875static PyObject *
11876unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011877/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011878{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879 Py_ssize_t i, length;
11880 int kind;
11881 void *data;
11882
11883 if (PyUnicode_READY(self) == -1)
11884 return NULL;
11885 length = PyUnicode_GET_LENGTH(self);
11886 kind = PyUnicode_KIND(self);
11887 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011888
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011889 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 if (length == 1)
11891 return PyBool_FromLong(
11892 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011893
11894 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011896 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 for (i = 0; i < length; i++) {
11899 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011900 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011901 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011902 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011903}
11904
INADA Naoki3ae20562017-01-16 20:41:20 +090011905/*[clinic input]
11906str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011907
INADA Naoki3ae20562017-01-16 20:41:20 +090011908Return True if the string is an alpha-numeric string, False otherwise.
11909
11910A string is alpha-numeric if all characters in the string are alpha-numeric and
11911there is at least one character in the string.
11912[clinic start generated code]*/
11913
11914static PyObject *
11915unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011916/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011917{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 int kind;
11919 void *data;
11920 Py_ssize_t len, i;
11921
11922 if (PyUnicode_READY(self) == -1)
11923 return NULL;
11924
11925 kind = PyUnicode_KIND(self);
11926 data = PyUnicode_DATA(self);
11927 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011928
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011929 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 if (len == 1) {
11931 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11932 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11933 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011934
11935 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011937 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 for (i = 0; i < len; i++) {
11940 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011941 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011942 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011943 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011944 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011945}
11946
INADA Naoki3ae20562017-01-16 20:41:20 +090011947/*[clinic input]
11948str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949
INADA Naoki3ae20562017-01-16 20:41:20 +090011950Return True if the string is a decimal string, False otherwise.
11951
11952A string is a decimal string if all characters in the string are decimal and
11953there is at least one character in the string.
11954[clinic start generated code]*/
11955
11956static PyObject *
11957unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011958/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 Py_ssize_t i, length;
11961 int kind;
11962 void *data;
11963
11964 if (PyUnicode_READY(self) == -1)
11965 return NULL;
11966 length = PyUnicode_GET_LENGTH(self);
11967 kind = PyUnicode_KIND(self);
11968 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 if (length == 1)
11972 return PyBool_FromLong(
11973 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011975 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011977 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 for (i = 0; i < length; i++) {
11980 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011981 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011983 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984}
11985
INADA Naoki3ae20562017-01-16 20:41:20 +090011986/*[clinic input]
11987str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988
INADA Naoki3ae20562017-01-16 20:41:20 +090011989Return True if the string is a digit string, False otherwise.
11990
11991A string is a digit string if all characters in the string are digits and there
11992is at least one character in the string.
11993[clinic start generated code]*/
11994
11995static PyObject *
11996unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011997/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011998{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 Py_ssize_t i, length;
12000 int kind;
12001 void *data;
12002
12003 if (PyUnicode_READY(self) == -1)
12004 return NULL;
12005 length = PyUnicode_GET_LENGTH(self);
12006 kind = PyUnicode_KIND(self);
12007 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010 if (length == 1) {
12011 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12012 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012015 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012017 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 for (i = 0; i < length; i++) {
12020 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012021 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012023 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024}
12025
INADA Naoki3ae20562017-01-16 20:41:20 +090012026/*[clinic input]
12027str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028
INADA Naoki3ae20562017-01-16 20:41:20 +090012029Return True if the string is a numeric string, False otherwise.
12030
12031A string is numeric if all characters in the string are numeric and there is at
12032least one character in the string.
12033[clinic start generated code]*/
12034
12035static PyObject *
12036unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012037/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 Py_ssize_t i, length;
12040 int kind;
12041 void *data;
12042
12043 if (PyUnicode_READY(self) == -1)
12044 return NULL;
12045 length = PyUnicode_GET_LENGTH(self);
12046 kind = PyUnicode_KIND(self);
12047 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012050 if (length == 1)
12051 return PyBool_FromLong(
12052 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012054 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012056 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 for (i = 0; i < length; i++) {
12059 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012060 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012062 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063}
12064
Martin v. Löwis47383402007-08-15 07:32:56 +000012065int
12066PyUnicode_IsIdentifier(PyObject *self)
12067{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012068 int kind;
12069 void *data;
12070 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012071 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012072
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012073 if (PyUnicode_READY(self) == -1) {
12074 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012075 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 }
12077
12078 /* Special case for empty strings */
12079 if (PyUnicode_GET_LENGTH(self) == 0)
12080 return 0;
12081 kind = PyUnicode_KIND(self);
12082 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012083
12084 /* PEP 3131 says that the first character must be in
12085 XID_Start and subsequent characters in XID_Continue,
12086 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012087 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012088 letters, digits, underscore). However, given the current
12089 definition of XID_Start and XID_Continue, it is sufficient
12090 to check just for these, except that _ must be allowed
12091 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012093 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012094 return 0;
12095
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012096 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012098 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012099 return 1;
12100}
12101
INADA Naoki3ae20562017-01-16 20:41:20 +090012102/*[clinic input]
12103str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012104
INADA Naoki3ae20562017-01-16 20:41:20 +090012105Return True if the string is a valid Python identifier, False otherwise.
12106
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012107Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012108such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012109[clinic start generated code]*/
12110
12111static PyObject *
12112unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012113/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012114{
12115 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12116}
12117
INADA Naoki3ae20562017-01-16 20:41:20 +090012118/*[clinic input]
12119str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012120
INADA Naoki3ae20562017-01-16 20:41:20 +090012121Return True if the string is printable, False otherwise.
12122
12123A string is printable if all of its characters are considered printable in
12124repr() or if it is empty.
12125[clinic start generated code]*/
12126
12127static PyObject *
12128unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012129/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012130{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 Py_ssize_t i, length;
12132 int kind;
12133 void *data;
12134
12135 if (PyUnicode_READY(self) == -1)
12136 return NULL;
12137 length = PyUnicode_GET_LENGTH(self);
12138 kind = PyUnicode_KIND(self);
12139 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012140
12141 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142 if (length == 1)
12143 return PyBool_FromLong(
12144 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012146 for (i = 0; i < length; i++) {
12147 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012148 Py_RETURN_FALSE;
12149 }
12150 }
12151 Py_RETURN_TRUE;
12152}
12153
INADA Naoki3ae20562017-01-16 20:41:20 +090012154/*[clinic input]
12155str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156
INADA Naoki3ae20562017-01-16 20:41:20 +090012157 iterable: object
12158 /
12159
12160Concatenate any number of strings.
12161
Martin Panter91a88662017-01-24 00:30:06 +000012162The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012163The result is returned as a new string.
12164
12165Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12166[clinic start generated code]*/
12167
12168static PyObject *
12169unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012170/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171{
INADA Naoki3ae20562017-01-16 20:41:20 +090012172 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173}
12174
Martin v. Löwis18e16552006-02-15 17:27:45 +000012175static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012176unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178 if (PyUnicode_READY(self) == -1)
12179 return -1;
12180 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181}
12182
INADA Naoki3ae20562017-01-16 20:41:20 +090012183/*[clinic input]
12184str.ljust as unicode_ljust
12185
12186 width: Py_ssize_t
12187 fillchar: Py_UCS4 = ' '
12188 /
12189
12190Return a left-justified string of length width.
12191
12192Padding is done using the specified fill character (default is a space).
12193[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194
12195static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012196unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12197/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012199 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012200 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201
Victor Stinnerc4b49542011-12-11 22:44:26 +010012202 if (PyUnicode_GET_LENGTH(self) >= width)
12203 return unicode_result_unchanged(self);
12204
12205 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206}
12207
INADA Naoki3ae20562017-01-16 20:41:20 +090012208/*[clinic input]
12209str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210
INADA Naoki3ae20562017-01-16 20:41:20 +090012211Return a copy of the string converted to lowercase.
12212[clinic start generated code]*/
12213
12214static PyObject *
12215unicode_lower_impl(PyObject *self)
12216/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012218 if (PyUnicode_READY(self) == -1)
12219 return NULL;
12220 if (PyUnicode_IS_ASCII(self))
12221 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012222 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223}
12224
/* Strip modes; the values double as indexes into stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method name for each strip mode, indexed by the constants above. */
static const char *stripfuncnames[] = {
    "lstrip",
    "rstrip",
    "strip"
};

/* Look up the method name that belongs to strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012233
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012234/* externally visible for str.strip(unicode) */
12235PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012236_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012237{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238 void *data;
12239 int kind;
12240 Py_ssize_t i, j, len;
12241 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012242 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012244 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12245 return NULL;
12246
12247 kind = PyUnicode_KIND(self);
12248 data = PyUnicode_DATA(self);
12249 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012250 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012251 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12252 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012253 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012254
Benjamin Peterson14339b62009-01-31 16:36:08 +000012255 i = 0;
12256 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012257 while (i < len) {
12258 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12259 if (!BLOOM(sepmask, ch))
12260 break;
12261 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12262 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012263 i++;
12264 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012265 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012266
Benjamin Peterson14339b62009-01-31 16:36:08 +000012267 j = len;
12268 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012269 j--;
12270 while (j >= i) {
12271 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12272 if (!BLOOM(sepmask, ch))
12273 break;
12274 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12275 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012276 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012277 }
12278
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012280 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012281
Victor Stinner7931d9a2011-11-04 00:22:48 +010012282 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283}
12284
12285PyObject*
12286PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12287{
12288 unsigned char *data;
12289 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012290 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291
Victor Stinnerde636f32011-10-01 03:55:54 +020012292 if (PyUnicode_READY(self) == -1)
12293 return NULL;
12294
Victor Stinner684d5fd2012-05-03 02:32:34 +020012295 length = PyUnicode_GET_LENGTH(self);
12296 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012297
Victor Stinner684d5fd2012-05-03 02:32:34 +020012298 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012299 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300
Victor Stinnerde636f32011-10-01 03:55:54 +020012301 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012302 PyErr_SetString(PyExc_IndexError, "string index out of range");
12303 return NULL;
12304 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012305 if (start >= length || end < start)
12306 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012307
Victor Stinner684d5fd2012-05-03 02:32:34 +020012308 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012309 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012310 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012311 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012312 }
12313 else {
12314 kind = PyUnicode_KIND(self);
12315 data = PyUnicode_1BYTE_DATA(self);
12316 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012317 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012318 length);
12319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012321
12322static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012323do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012324{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 Py_ssize_t len, i, j;
12326
12327 if (PyUnicode_READY(self) == -1)
12328 return NULL;
12329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012331
Victor Stinnercc7af722013-04-09 22:39:24 +020012332 if (PyUnicode_IS_ASCII(self)) {
12333 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12334
12335 i = 0;
12336 if (striptype != RIGHTSTRIP) {
12337 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012338 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012339 if (!_Py_ascii_whitespace[ch])
12340 break;
12341 i++;
12342 }
12343 }
12344
12345 j = len;
12346 if (striptype != LEFTSTRIP) {
12347 j--;
12348 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012349 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012350 if (!_Py_ascii_whitespace[ch])
12351 break;
12352 j--;
12353 }
12354 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012355 }
12356 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012357 else {
12358 int kind = PyUnicode_KIND(self);
12359 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012360
Victor Stinnercc7af722013-04-09 22:39:24 +020012361 i = 0;
12362 if (striptype != RIGHTSTRIP) {
12363 while (i < len) {
12364 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12365 if (!Py_UNICODE_ISSPACE(ch))
12366 break;
12367 i++;
12368 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012369 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012370
12371 j = len;
12372 if (striptype != LEFTSTRIP) {
12373 j--;
12374 while (j >= i) {
12375 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12376 if (!Py_UNICODE_ISSPACE(ch))
12377 break;
12378 j--;
12379 }
12380 j++;
12381 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012382 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012383
Victor Stinner7931d9a2011-11-04 00:22:48 +010012384 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012385}
12386
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012387
12388static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012389do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012390{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012391 if (sep != NULL && sep != Py_None) {
12392 if (PyUnicode_Check(sep))
12393 return _PyUnicode_XStrip(self, striptype, sep);
12394 else {
12395 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012396 "%s arg must be None or str",
12397 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012398 return NULL;
12399 }
12400 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012401
Benjamin Peterson14339b62009-01-31 16:36:08 +000012402 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012403}
12404
12405
INADA Naoki3ae20562017-01-16 20:41:20 +090012406/*[clinic input]
12407str.strip as unicode_strip
12408
12409 chars: object = None
12410 /
12411
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012413
12414If chars is given and not None, remove characters in chars instead.
12415[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012416
12417static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012418unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012419/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012420{
INADA Naoki3ae20562017-01-16 20:41:20 +090012421 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012422}
12423
12424
INADA Naoki3ae20562017-01-16 20:41:20 +090012425/*[clinic input]
12426str.lstrip as unicode_lstrip
12427
12428 chars: object = NULL
12429 /
12430
12431Return a copy of the string with leading whitespace removed.
12432
12433If chars is given and not None, remove characters in chars instead.
12434[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012435
12436static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012437unicode_lstrip_impl(PyObject *self, PyObject *chars)
12438/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012439{
INADA Naoki3ae20562017-01-16 20:41:20 +090012440 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012441}
12442
12443
INADA Naoki3ae20562017-01-16 20:41:20 +090012444/*[clinic input]
12445str.rstrip as unicode_rstrip
12446
12447 chars: object = NULL
12448 /
12449
12450Return a copy of the string with trailing whitespace removed.
12451
12452If chars is given and not None, remove characters in chars instead.
12453[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012454
12455static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012456unicode_rstrip_impl(PyObject *self, PyObject *chars)
12457/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012458{
INADA Naoki3ae20562017-01-16 20:41:20 +090012459 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012460}
12461
12462
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012464unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012465{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012466 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012468
Serhiy Storchaka05997252013-01-26 12:14:02 +020012469 if (len < 1)
12470 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012471
Victor Stinnerc4b49542011-12-11 22:44:26 +010012472 /* no repeat, return original string */
12473 if (len == 1)
12474 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012475
Benjamin Petersonbac79492012-01-14 13:34:47 -050012476 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 return NULL;
12478
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012479 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012480 PyErr_SetString(PyExc_OverflowError,
12481 "repeated string is too long");
12482 return NULL;
12483 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012484 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012485
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012486 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012487 if (!u)
12488 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012489 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012491 if (PyUnicode_GET_LENGTH(str) == 1) {
12492 const int kind = PyUnicode_KIND(str);
12493 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012494 if (kind == PyUnicode_1BYTE_KIND) {
12495 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012496 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012497 }
12498 else if (kind == PyUnicode_2BYTE_KIND) {
12499 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012500 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012501 ucs2[n] = fill_char;
12502 } else {
12503 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12504 assert(kind == PyUnicode_4BYTE_KIND);
12505 for (n = 0; n < len; ++n)
12506 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012508 }
12509 else {
12510 /* number of characters copied this far */
12511 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012512 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012513 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012514 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012516 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012517 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012518 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012519 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012520 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521 }
12522
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012523 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012524 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525}
12526
Alexander Belopolsky40018472011-02-26 01:02:56 +000012527PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012528PyUnicode_Replace(PyObject *str,
12529 PyObject *substr,
12530 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012531 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012533 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12534 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012535 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012536 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537}
12538
INADA Naoki3ae20562017-01-16 20:41:20 +090012539/*[clinic input]
12540str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541
INADA Naoki3ae20562017-01-16 20:41:20 +090012542 old: unicode
12543 new: unicode
12544 count: Py_ssize_t = -1
12545 Maximum number of occurrences to replace.
12546 -1 (the default value) means replace all occurrences.
12547 /
12548
12549Return a copy with all occurrences of substring old replaced by new.
12550
12551If the optional argument count is given, only the first count occurrences are
12552replaced.
12553[clinic start generated code]*/
12554
12555static PyObject *
12556unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12557 Py_ssize_t count)
12558/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012560 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012561 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012562 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563}
12564
Alexander Belopolsky40018472011-02-26 01:02:56 +000012565static PyObject *
12566unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012568 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 Py_ssize_t isize;
12570 Py_ssize_t osize, squote, dquote, i, o;
12571 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012572 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012573 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012576 return NULL;
12577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 isize = PyUnicode_GET_LENGTH(unicode);
12579 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 /* Compute length of output, quote characters, and
12582 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012583 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 max = 127;
12585 squote = dquote = 0;
12586 ikind = PyUnicode_KIND(unicode);
12587 for (i = 0; i < isize; i++) {
12588 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012589 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012590 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012591 case '\'': squote++; break;
12592 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012593 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012594 incr = 2;
12595 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012596 default:
12597 /* Fast-path ASCII */
12598 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012599 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012600 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012601 ;
12602 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012603 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012604 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012605 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012607 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012609 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012611 if (osize > PY_SSIZE_T_MAX - incr) {
12612 PyErr_SetString(PyExc_OverflowError,
12613 "string is too long to generate repr");
12614 return NULL;
12615 }
12616 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012617 }
12618
12619 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012620 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012621 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012622 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012623 if (dquote)
12624 /* Both squote and dquote present. Use squote,
12625 and escape them */
12626 osize += squote;
12627 else
12628 quote = '"';
12629 }
Victor Stinner55c08782013-04-14 18:45:39 +020012630 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631
12632 repr = PyUnicode_New(osize, max);
12633 if (repr == NULL)
12634 return NULL;
12635 okind = PyUnicode_KIND(repr);
12636 odata = PyUnicode_DATA(repr);
12637
12638 PyUnicode_WRITE(okind, odata, 0, quote);
12639 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012640 if (unchanged) {
12641 _PyUnicode_FastCopyCharacters(repr, 1,
12642 unicode, 0,
12643 isize);
12644 }
12645 else {
12646 for (i = 0, o = 1; i < isize; i++) {
12647 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648
Victor Stinner55c08782013-04-14 18:45:39 +020012649 /* Escape quotes and backslashes */
12650 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012651 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012653 continue;
12654 }
12655
12656 /* Map special whitespace to '\t', \n', '\r' */
12657 if (ch == '\t') {
12658 PyUnicode_WRITE(okind, odata, o++, '\\');
12659 PyUnicode_WRITE(okind, odata, o++, 't');
12660 }
12661 else if (ch == '\n') {
12662 PyUnicode_WRITE(okind, odata, o++, '\\');
12663 PyUnicode_WRITE(okind, odata, o++, 'n');
12664 }
12665 else if (ch == '\r') {
12666 PyUnicode_WRITE(okind, odata, o++, '\\');
12667 PyUnicode_WRITE(okind, odata, o++, 'r');
12668 }
12669
12670 /* Map non-printable US ASCII to '\xhh' */
12671 else if (ch < ' ' || ch == 0x7F) {
12672 PyUnicode_WRITE(okind, odata, o++, '\\');
12673 PyUnicode_WRITE(okind, odata, o++, 'x');
12674 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12675 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12676 }
12677
12678 /* Copy ASCII characters as-is */
12679 else if (ch < 0x7F) {
12680 PyUnicode_WRITE(okind, odata, o++, ch);
12681 }
12682
12683 /* Non-ASCII characters */
12684 else {
12685 /* Map Unicode whitespace and control characters
12686 (categories Z* and C* except ASCII space)
12687 */
12688 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12689 PyUnicode_WRITE(okind, odata, o++, '\\');
12690 /* Map 8-bit characters to '\xhh' */
12691 if (ch <= 0xff) {
12692 PyUnicode_WRITE(okind, odata, o++, 'x');
12693 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12694 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12695 }
12696 /* Map 16-bit characters to '\uxxxx' */
12697 else if (ch <= 0xffff) {
12698 PyUnicode_WRITE(okind, odata, o++, 'u');
12699 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12700 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12701 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12702 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12703 }
12704 /* Map 21-bit characters to '\U00xxxxxx' */
12705 else {
12706 PyUnicode_WRITE(okind, odata, o++, 'U');
12707 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12708 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12709 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12710 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12711 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12712 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12713 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12714 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12715 }
12716 }
12717 /* Copy characters as-is */
12718 else {
12719 PyUnicode_WRITE(okind, odata, o++, ch);
12720 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012721 }
12722 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012723 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012725 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012726 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012727}
12728
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012729PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012730 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012731\n\
12732Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012733such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734arguments start and end are interpreted as in slice notation.\n\
12735\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012736Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737
12738static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012741 /* initialize variables to prevent gcc warning */
12742 PyObject *substring = NULL;
12743 Py_ssize_t start = 0;
12744 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012745 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012746
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012747 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012748 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012749
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012750 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012753 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755 if (result == -2)
12756 return NULL;
12757
Christian Heimes217cfd12007-12-02 14:31:20 +000012758 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012759}
12760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012761PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012762 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012763\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012764Return the highest index in S where substring sub is found,\n\
12765such that sub is contained within S[start:end]. Optional\n\
12766arguments start and end are interpreted as in slice notation.\n\
12767\n\
12768Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012769
12770static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012771unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012772{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012773 /* initialize variables to prevent gcc warning */
12774 PyObject *substring = NULL;
12775 Py_ssize_t start = 0;
12776 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012777 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012778
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012779 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012780 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012781
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012782 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012784
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012785 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012787 if (result == -2)
12788 return NULL;
12789
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790 if (result < 0) {
12791 PyErr_SetString(PyExc_ValueError, "substring not found");
12792 return NULL;
12793 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012794
Christian Heimes217cfd12007-12-02 14:31:20 +000012795 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796}
12797
INADA Naoki3ae20562017-01-16 20:41:20 +090012798/*[clinic input]
12799str.rjust as unicode_rjust
12800
12801 width: Py_ssize_t
12802 fillchar: Py_UCS4 = ' '
12803 /
12804
12805Return a right-justified string of length width.
12806
12807Padding is done using the specified fill character (default is a space).
12808[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809
12810static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012811unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12812/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012814 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815 return NULL;
12816
Victor Stinnerc4b49542011-12-11 22:44:26 +010012817 if (PyUnicode_GET_LENGTH(self) >= width)
12818 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819
Victor Stinnerc4b49542011-12-11 22:44:26 +010012820 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821}
12822
Alexander Belopolsky40018472011-02-26 01:02:56 +000012823PyObject *
12824PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012825{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012826 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012827 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012829 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830}
12831
INADA Naoki3ae20562017-01-16 20:41:20 +090012832/*[clinic input]
12833str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834
INADA Naoki3ae20562017-01-16 20:41:20 +090012835 sep: object = None
        The delimiter according to which to split the string.
12837 None (the default value) means split according to any whitespace,
12838 and discard empty strings from the result.
12839 maxsplit: Py_ssize_t = -1
12840 Maximum number of splits to do.
12841 -1 (the default value) means no limit.
12842
12843Return a list of the words in the string, using sep as the delimiter string.
12844[clinic start generated code]*/
12845
12846static PyObject *
12847unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12848/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012849{
INADA Naoki3ae20562017-01-16 20:41:20 +090012850 if (sep == Py_None)
12851 return split(self, NULL, maxsplit);
12852 if (PyUnicode_Check(sep))
12853 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012854
Victor Stinner998b8062018-09-12 00:23:25 +020012855 PyErr_Format(PyExc_TypeError,
12856 "must be str or None, not %.100s",
12857 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012858 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012859}
12860
Thomas Wouters477c8d52006-05-27 19:21:47 +000012861PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012862PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012863{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012864 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012865 int kind1, kind2;
12866 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012867 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012868
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012869 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012870 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012871
Victor Stinner14f8f022011-10-05 20:58:25 +020012872 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012873 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012874 len1 = PyUnicode_GET_LENGTH(str_obj);
12875 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012876 if (kind1 < kind2 || len1 < len2) {
12877 _Py_INCREF_UNICODE_EMPTY();
12878 if (!unicode_empty)
12879 out = NULL;
12880 else {
12881 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12882 Py_DECREF(unicode_empty);
12883 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012884 return out;
12885 }
12886 buf1 = PyUnicode_DATA(str_obj);
12887 buf2 = PyUnicode_DATA(sep_obj);
12888 if (kind2 != kind1) {
12889 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12890 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012891 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012892 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012893
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012894 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012895 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012896 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12897 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12898 else
12899 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012900 break;
12901 case PyUnicode_2BYTE_KIND:
12902 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12903 break;
12904 case PyUnicode_4BYTE_KIND:
12905 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12906 break;
12907 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012908 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012909 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012910
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012911 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012912 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012913
12914 return out;
12915}
12916
12917
12918PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012919PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012920{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012921 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012922 int kind1, kind2;
12923 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012924 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012925
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012926 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012927 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012928
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012929 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012931 len1 = PyUnicode_GET_LENGTH(str_obj);
12932 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012933 if (kind1 < kind2 || len1 < len2) {
12934 _Py_INCREF_UNICODE_EMPTY();
12935 if (!unicode_empty)
12936 out = NULL;
12937 else {
12938 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12939 Py_DECREF(unicode_empty);
12940 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012941 return out;
12942 }
12943 buf1 = PyUnicode_DATA(str_obj);
12944 buf2 = PyUnicode_DATA(sep_obj);
12945 if (kind2 != kind1) {
12946 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12947 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012948 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012949 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012950
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012951 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012952 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012953 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12954 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12955 else
12956 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957 break;
12958 case PyUnicode_2BYTE_KIND:
12959 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12960 break;
12961 case PyUnicode_4BYTE_KIND:
12962 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12963 break;
12964 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012965 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012966 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012967
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012968 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012969 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012970
12971 return out;
12972}
12973
INADA Naoki3ae20562017-01-16 20:41:20 +090012974/*[clinic input]
12975str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012976
INADA Naoki3ae20562017-01-16 20:41:20 +090012977 sep: object
12978 /
12979
12980Partition the string into three parts using the given separator.
12981
12982This will search for the separator in the string. If the separator is found,
12983returns a 3-tuple containing the part before the separator, the separator
12984itself, and the part after it.
12985
12986If the separator is not found, returns a 3-tuple containing the original string
12987and two empty strings.
12988[clinic start generated code]*/
12989
12990static PyObject *
12991unicode_partition(PyObject *self, PyObject *sep)
12992/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000012993{
INADA Naoki3ae20562017-01-16 20:41:20 +090012994 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012995}
12996
INADA Naoki3ae20562017-01-16 20:41:20 +090012997/*[clinic input]
12998str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012999
INADA Naoki3ae20562017-01-16 20:41:20 +090013000Partition the string into three parts using the given separator.
13001
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013002This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013003the separator is found, returns a 3-tuple containing the part before the
13004separator, the separator itself, and the part after it.
13005
13006If the separator is not found, returns a 3-tuple containing two empty strings
13007and the original string.
13008[clinic start generated code]*/
13009
13010static PyObject *
13011unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013012/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013013{
INADA Naoki3ae20562017-01-16 20:41:20 +090013014 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013015}
13016
Alexander Belopolsky40018472011-02-26 01:02:56 +000013017PyObject *
13018PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013019{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013020 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013021 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013022
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013023 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013024}
13025
INADA Naoki3ae20562017-01-16 20:41:20 +090013026/*[clinic input]
13027str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013028
INADA Naoki3ae20562017-01-16 20:41:20 +090013029Return a list of the words in the string, using sep as the delimiter string.
13030
13031Splits are done starting at the end of the string and working to the front.
13032[clinic start generated code]*/
13033
13034static PyObject *
13035unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13036/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013037{
INADA Naoki3ae20562017-01-16 20:41:20 +090013038 if (sep == Py_None)
13039 return rsplit(self, NULL, maxsplit);
13040 if (PyUnicode_Check(sep))
13041 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013042
Victor Stinner998b8062018-09-12 00:23:25 +020013043 PyErr_Format(PyExc_TypeError,
13044 "must be str or None, not %.100s",
13045 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013046 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013047}
13048
INADA Naoki3ae20562017-01-16 20:41:20 +090013049/*[clinic input]
13050str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013051
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013052 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013053
13054Return a list of the lines in the string, breaking at line boundaries.
13055
13056Line breaks are not included in the resulting list unless keepends is given and
13057true.
13058[clinic start generated code]*/
13059
13060static PyObject *
13061unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013062/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013063{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013064 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013065}
13066
13067static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013068PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013069{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013070 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071}
13072
INADA Naoki3ae20562017-01-16 20:41:20 +090013073/*[clinic input]
13074str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013075
INADA Naoki3ae20562017-01-16 20:41:20 +090013076Convert uppercase characters to lowercase and lowercase characters to uppercase.
13077[clinic start generated code]*/
13078
13079static PyObject *
13080unicode_swapcase_impl(PyObject *self)
13081/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013083 if (PyUnicode_READY(self) == -1)
13084 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013085 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086}
13087
Larry Hastings61272b72014-01-07 12:41:53 -080013088/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013089
Larry Hastings31826802013-10-19 00:09:25 -070013090@staticmethod
13091str.maketrans as unicode_maketrans
13092
13093 x: object
13094
13095 y: unicode=NULL
13096
13097 z: unicode=NULL
13098
13099 /
13100
13101Return a translation table usable for str.translate().
13102
13103If there is only one argument, it must be a dictionary mapping Unicode
13104ordinals (integers) or characters to Unicode ordinals, strings or None.
13105Character keys will be then converted to ordinals.
13106If there are two arguments, they must be strings of equal length, and
13107in the resulting dictionary, each character in x will be mapped to the
13108character at the same position in y. If there is a third argument, it
13109must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013110[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013111
Larry Hastings31826802013-10-19 00:09:25 -070013112static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013113unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013114/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013115{
Georg Brandlceee0772007-11-27 23:48:05 +000013116 PyObject *new = NULL, *key, *value;
13117 Py_ssize_t i = 0;
13118 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013119
Georg Brandlceee0772007-11-27 23:48:05 +000013120 new = PyDict_New();
13121 if (!new)
13122 return NULL;
13123 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013124 int x_kind, y_kind, z_kind;
13125 void *x_data, *y_data, *z_data;
13126
Georg Brandlceee0772007-11-27 23:48:05 +000013127 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013128 if (!PyUnicode_Check(x)) {
13129 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13130 "be a string if there is a second argument");
13131 goto err;
13132 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013133 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013134 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13135 "arguments must have equal length");
13136 goto err;
13137 }
13138 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013139 x_kind = PyUnicode_KIND(x);
13140 y_kind = PyUnicode_KIND(y);
13141 x_data = PyUnicode_DATA(x);
13142 y_data = PyUnicode_DATA(y);
13143 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13144 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013145 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013146 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013147 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013148 if (!value) {
13149 Py_DECREF(key);
13150 goto err;
13151 }
Georg Brandlceee0772007-11-27 23:48:05 +000013152 res = PyDict_SetItem(new, key, value);
13153 Py_DECREF(key);
13154 Py_DECREF(value);
13155 if (res < 0)
13156 goto err;
13157 }
13158 /* create entries for deleting chars in z */
13159 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013160 z_kind = PyUnicode_KIND(z);
13161 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013162 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013163 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013164 if (!key)
13165 goto err;
13166 res = PyDict_SetItem(new, key, Py_None);
13167 Py_DECREF(key);
13168 if (res < 0)
13169 goto err;
13170 }
13171 }
13172 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013173 int kind;
13174 void *data;
13175
Georg Brandlceee0772007-11-27 23:48:05 +000013176 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013177 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013178 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13179 "to maketrans it must be a dict");
13180 goto err;
13181 }
13182 /* copy entries into the new dict, converting string keys to int keys */
13183 while (PyDict_Next(x, &i, &key, &value)) {
13184 if (PyUnicode_Check(key)) {
13185 /* convert string keys to integer keys */
13186 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013187 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013188 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13189 "table must be of length 1");
13190 goto err;
13191 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013192 kind = PyUnicode_KIND(key);
13193 data = PyUnicode_DATA(key);
13194 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013195 if (!newkey)
13196 goto err;
13197 res = PyDict_SetItem(new, newkey, value);
13198 Py_DECREF(newkey);
13199 if (res < 0)
13200 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013201 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013202 /* just keep integer keys */
13203 if (PyDict_SetItem(new, key, value) < 0)
13204 goto err;
13205 } else {
13206 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13207 "be strings or integers");
13208 goto err;
13209 }
13210 }
13211 }
13212 return new;
13213 err:
13214 Py_DECREF(new);
13215 return NULL;
13216}
13217
INADA Naoki3ae20562017-01-16 20:41:20 +090013218/*[clinic input]
13219str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013220
INADA Naoki3ae20562017-01-16 20:41:20 +090013221 table: object
13222 Translation table, which must be a mapping of Unicode ordinals to
13223 Unicode ordinals, strings, or None.
13224 /
13225
13226Replace each character in the string using the given translation table.
13227
13228The table must implement lookup/indexing via __getitem__, for instance a
13229dictionary or list. If this operation raises LookupError, the character is
13230left untouched. Characters mapped to None are deleted.
13231[clinic start generated code]*/
13232
13233static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013235/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013236{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013237 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013238}
13239
INADA Naoki3ae20562017-01-16 20:41:20 +090013240/*[clinic input]
13241str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013242
INADA Naoki3ae20562017-01-16 20:41:20 +090013243Return a copy of the string converted to uppercase.
13244[clinic start generated code]*/
13245
13246static PyObject *
13247unicode_upper_impl(PyObject *self)
13248/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013249{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013250 if (PyUnicode_READY(self) == -1)
13251 return NULL;
13252 if (PyUnicode_IS_ASCII(self))
13253 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013254 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013255}
13256
INADA Naoki3ae20562017-01-16 20:41:20 +090013257/*[clinic input]
13258str.zfill as unicode_zfill
13259
13260 width: Py_ssize_t
13261 /
13262
13263Pad a numeric string with zeros on the left, to fill a field of the given width.
13264
13265The string is never truncated.
13266[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013267
13268static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013269unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013270/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013271{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013272 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013273 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013274 int kind;
13275 void *data;
13276 Py_UCS4 chr;
13277
Benjamin Petersonbac79492012-01-14 13:34:47 -050013278 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013279 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013280
Victor Stinnerc4b49542011-12-11 22:44:26 +010013281 if (PyUnicode_GET_LENGTH(self) >= width)
13282 return unicode_result_unchanged(self);
13283
13284 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013285
13286 u = pad(self, fill, 0, '0');
13287
Walter Dörwald068325e2002-04-15 13:36:47 +000013288 if (u == NULL)
13289 return NULL;
13290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013291 kind = PyUnicode_KIND(u);
13292 data = PyUnicode_DATA(u);
13293 chr = PyUnicode_READ(kind, data, fill);
13294
13295 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013297 PyUnicode_WRITE(kind, data, 0, chr);
13298 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013299 }
13300
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013301 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013302 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013303}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013304
#if 0
/* Compiled out: wrapper around PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13312
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013313PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013314 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013315\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013316Return True if S starts with the specified prefix, False otherwise.\n\
13317With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013318With optional end, stop comparing S at that position.\n\
13319prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013320
13321static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013322unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013323 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013324{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013325 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013326 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013327 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013328 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013329 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013330
Jesus Ceaac451502011-04-20 17:09:23 +020013331 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013332 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013333 if (PyTuple_Check(subobj)) {
13334 Py_ssize_t i;
13335 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013336 substring = PyTuple_GET_ITEM(subobj, i);
13337 if (!PyUnicode_Check(substring)) {
13338 PyErr_Format(PyExc_TypeError,
13339 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013340 "not %.100s",
13341 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013342 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013343 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013344 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013345 if (result == -1)
13346 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013347 if (result) {
13348 Py_RETURN_TRUE;
13349 }
13350 }
13351 /* nothing matched */
13352 Py_RETURN_FALSE;
13353 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013354 if (!PyUnicode_Check(subobj)) {
13355 PyErr_Format(PyExc_TypeError,
13356 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013357 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013358 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013359 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013360 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013361 if (result == -1)
13362 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013363 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013364}
13365
13366
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013367PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013368 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013369\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013370Return True if S ends with the specified suffix, False otherwise.\n\
13371With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013372With optional end, stop comparing S at that position.\n\
13373suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013374
13375static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013376unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013377 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013378{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013379 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013380 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013381 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013382 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013383 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013384
Jesus Ceaac451502011-04-20 17:09:23 +020013385 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013386 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013387 if (PyTuple_Check(subobj)) {
13388 Py_ssize_t i;
13389 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013390 substring = PyTuple_GET_ITEM(subobj, i);
13391 if (!PyUnicode_Check(substring)) {
13392 PyErr_Format(PyExc_TypeError,
13393 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013394 "not %.100s",
13395 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013396 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013397 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013398 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013399 if (result == -1)
13400 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013401 if (result) {
13402 Py_RETURN_TRUE;
13403 }
13404 }
13405 Py_RETURN_FALSE;
13406 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013407 if (!PyUnicode_Check(subobj)) {
13408 PyErr_Format(PyExc_TypeError,
13409 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013410 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013411 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013412 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013413 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013414 if (result == -1)
13415 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013416 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013417}
13418
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013419static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013420_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013421{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013422 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13423 writer->data = PyUnicode_DATA(writer->buffer);
13424
13425 if (!writer->readonly) {
13426 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013427 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013428 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013429 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013430 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13431 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13432 writer->kind = PyUnicode_WCHAR_KIND;
13433 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13434
Victor Stinner8f674cc2013-04-17 23:02:17 +020013435 /* Copy-on-write mode: set buffer size to 0 so
13436 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13437 * next write. */
13438 writer->size = 0;
13439 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013440}
13441
Victor Stinnerd3f08822012-05-29 12:57:52 +020013442void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013443_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013444{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013445 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013446
13447 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013448 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013449
13450 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13451 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13452 writer->kind = PyUnicode_WCHAR_KIND;
13453 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013454}
13455
Victor Stinnerd3f08822012-05-29 12:57:52 +020013456int
13457_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13458 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013459{
13460 Py_ssize_t newlen;
13461 PyObject *newbuffer;
13462
Victor Stinner2740e462016-09-06 16:58:36 -070013463 assert(maxchar <= MAX_UNICODE);
13464
Victor Stinnerca9381e2015-09-22 00:58:32 +020013465 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013466 assert((maxchar > writer->maxchar && length >= 0)
13467 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013468
Victor Stinner202fdca2012-05-07 12:47:02 +020013469 if (length > PY_SSIZE_T_MAX - writer->pos) {
13470 PyErr_NoMemory();
13471 return -1;
13472 }
13473 newlen = writer->pos + length;
13474
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013475 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013476
Victor Stinnerd3f08822012-05-29 12:57:52 +020013477 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013478 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013479 if (writer->overallocate
13480 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13481 /* overallocate to limit the number of realloc() */
13482 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013483 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013484 if (newlen < writer->min_length)
13485 newlen = writer->min_length;
13486
Victor Stinnerd3f08822012-05-29 12:57:52 +020013487 writer->buffer = PyUnicode_New(newlen, maxchar);
13488 if (writer->buffer == NULL)
13489 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013490 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013491 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013492 if (writer->overallocate
13493 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13494 /* overallocate to limit the number of realloc() */
13495 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013496 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013497 if (newlen < writer->min_length)
13498 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013499
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013500 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013501 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013502 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013503 newbuffer = PyUnicode_New(newlen, maxchar);
13504 if (newbuffer == NULL)
13505 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013506 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13507 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013508 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013509 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013510 }
13511 else {
13512 newbuffer = resize_compact(writer->buffer, newlen);
13513 if (newbuffer == NULL)
13514 return -1;
13515 }
13516 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013517 }
13518 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013519 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013520 newbuffer = PyUnicode_New(writer->size, maxchar);
13521 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013522 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013523 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13524 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013525 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013526 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013527 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013528 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013529
13530#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013531}
13532
Victor Stinnerca9381e2015-09-22 00:58:32 +020013533int
13534_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13535 enum PyUnicode_Kind kind)
13536{
13537 Py_UCS4 maxchar;
13538
13539 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13540 assert(writer->kind < kind);
13541
13542 switch (kind)
13543 {
13544 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13545 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13546 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13547 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013548 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013549 }
13550
13551 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13552}
13553
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013554static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013555_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013556{
Victor Stinner2740e462016-09-06 16:58:36 -070013557 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013558 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13559 return -1;
13560 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13561 writer->pos++;
13562 return 0;
13563}
13564
13565int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013566_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13567{
13568 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13569}
13570
13571int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013572_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13573{
13574 Py_UCS4 maxchar;
13575 Py_ssize_t len;
13576
13577 if (PyUnicode_READY(str) == -1)
13578 return -1;
13579 len = PyUnicode_GET_LENGTH(str);
13580 if (len == 0)
13581 return 0;
13582 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13583 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013584 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013585 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013586 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013587 Py_INCREF(str);
13588 writer->buffer = str;
13589 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013590 writer->pos += len;
13591 return 0;
13592 }
13593 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13594 return -1;
13595 }
13596 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13597 str, 0, len);
13598 writer->pos += len;
13599 return 0;
13600}
13601
Victor Stinnere215d962012-10-06 23:03:36 +020013602int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013603_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13604 Py_ssize_t start, Py_ssize_t end)
13605{
13606 Py_UCS4 maxchar;
13607 Py_ssize_t len;
13608
13609 if (PyUnicode_READY(str) == -1)
13610 return -1;
13611
13612 assert(0 <= start);
13613 assert(end <= PyUnicode_GET_LENGTH(str));
13614 assert(start <= end);
13615
13616 if (end == 0)
13617 return 0;
13618
13619 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13620 return _PyUnicodeWriter_WriteStr(writer, str);
13621
13622 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13623 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13624 else
13625 maxchar = writer->maxchar;
13626 len = end - start;
13627
13628 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13629 return -1;
13630
13631 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13632 str, start, len);
13633 writer->pos += len;
13634 return 0;
13635}
13636
13637int
Victor Stinner4a587072013-11-19 12:54:53 +010013638_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13639 const char *ascii, Py_ssize_t len)
13640{
13641 if (len == -1)
13642 len = strlen(ascii);
13643
13644 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13645
13646 if (writer->buffer == NULL && !writer->overallocate) {
13647 PyObject *str;
13648
13649 str = _PyUnicode_FromASCII(ascii, len);
13650 if (str == NULL)
13651 return -1;
13652
13653 writer->readonly = 1;
13654 writer->buffer = str;
13655 _PyUnicodeWriter_Update(writer);
13656 writer->pos += len;
13657 return 0;
13658 }
13659
13660 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13661 return -1;
13662
13663 switch (writer->kind)
13664 {
13665 case PyUnicode_1BYTE_KIND:
13666 {
13667 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13668 Py_UCS1 *data = writer->data;
13669
Christian Heimesf051e432016-09-13 20:22:02 +020013670 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013671 break;
13672 }
13673 case PyUnicode_2BYTE_KIND:
13674 {
13675 _PyUnicode_CONVERT_BYTES(
13676 Py_UCS1, Py_UCS2,
13677 ascii, ascii + len,
13678 (Py_UCS2 *)writer->data + writer->pos);
13679 break;
13680 }
13681 case PyUnicode_4BYTE_KIND:
13682 {
13683 _PyUnicode_CONVERT_BYTES(
13684 Py_UCS1, Py_UCS4,
13685 ascii, ascii + len,
13686 (Py_UCS4 *)writer->data + writer->pos);
13687 break;
13688 }
13689 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013690 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013691 }
13692
13693 writer->pos += len;
13694 return 0;
13695}
13696
13697int
13698_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13699 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013700{
13701 Py_UCS4 maxchar;
13702
13703 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13704 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13705 return -1;
13706 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13707 writer->pos += len;
13708 return 0;
13709}
13710
Victor Stinnerd3f08822012-05-29 12:57:52 +020013711PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013712_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013713{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013714 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013715
Victor Stinnerd3f08822012-05-29 12:57:52 +020013716 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013717 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013718 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013719 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013720
13721 str = writer->buffer;
13722 writer->buffer = NULL;
13723
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013724 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013725 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13726 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013727 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013728
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013729 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13730 PyObject *str2;
13731 str2 = resize_compact(str, writer->pos);
13732 if (str2 == NULL) {
13733 Py_DECREF(str);
13734 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013735 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013736 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013737 }
13738
Victor Stinner15a0bd32013-07-08 22:29:55 +020013739 assert(_PyUnicode_CheckConsistency(str, 1));
13740 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013741}
13742
Victor Stinnerd3f08822012-05-29 12:57:52 +020013743void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013744_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013745{
13746 Py_CLEAR(writer->buffer);
13747}
13748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013749#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013750
13751PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013752 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013753\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013754Return a formatted version of S, using substitutions from args and kwargs.\n\
13755The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013756
Eric Smith27bbca62010-11-04 17:06:58 +000013757PyDoc_STRVAR(format_map__doc__,
13758 "S.format_map(mapping) -> str\n\
13759\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013760Return a formatted version of S, using substitutions from mapping.\n\
13761The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013762
INADA Naoki3ae20562017-01-16 20:41:20 +090013763/*[clinic input]
13764str.__format__ as unicode___format__
13765
13766 format_spec: unicode
13767 /
13768
13769Return a formatted version of the string as described by format_spec.
13770[clinic start generated code]*/
13771
Eric Smith4a7d76d2008-05-30 18:10:19 +000013772static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013773unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013774/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013775{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013776 _PyUnicodeWriter writer;
13777 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013778
Victor Stinnerd3f08822012-05-29 12:57:52 +020013779 if (PyUnicode_READY(self) == -1)
13780 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013781 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013782 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13783 self, format_spec, 0,
13784 PyUnicode_GET_LENGTH(format_spec));
13785 if (ret == -1) {
13786 _PyUnicodeWriter_Dealloc(&writer);
13787 return NULL;
13788 }
13789 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013790}
13791
INADA Naoki3ae20562017-01-16 20:41:20 +090013792/*[clinic input]
13793str.__sizeof__ as unicode_sizeof
13794
13795Return the size of the string in memory, in bytes.
13796[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013797
13798static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013799unicode_sizeof_impl(PyObject *self)
13800/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013801{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013802 Py_ssize_t size;
13803
13804 /* If it's a compact object, account for base structure +
13805 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013806 if (PyUnicode_IS_COMPACT_ASCII(self))
13807 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13808 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013809 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013810 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013811 else {
13812 /* If it is a two-block object, account for base object, and
13813 for character block if present. */
13814 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013815 if (_PyUnicode_DATA_ANY(self))
13816 size += (PyUnicode_GET_LENGTH(self) + 1) *
13817 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013818 }
13819 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013820 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013821 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13822 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13823 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13824 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013825
13826 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013827}
13828
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013829static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013830unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013831{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013832 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013833 if (!copy)
13834 return NULL;
13835 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013836}
13837
Guido van Rossumd57fd912000-03-10 22:53:23 +000013838static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013839 UNICODE_ENCODE_METHODDEF
13840 UNICODE_REPLACE_METHODDEF
13841 UNICODE_SPLIT_METHODDEF
13842 UNICODE_RSPLIT_METHODDEF
13843 UNICODE_JOIN_METHODDEF
13844 UNICODE_CAPITALIZE_METHODDEF
13845 UNICODE_CASEFOLD_METHODDEF
13846 UNICODE_TITLE_METHODDEF
13847 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013848 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013849 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013850 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013851 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013852 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013853 UNICODE_LJUST_METHODDEF
13854 UNICODE_LOWER_METHODDEF
13855 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013856 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13857 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013858 UNICODE_RJUST_METHODDEF
13859 UNICODE_RSTRIP_METHODDEF
13860 UNICODE_RPARTITION_METHODDEF
13861 UNICODE_SPLITLINES_METHODDEF
13862 UNICODE_STRIP_METHODDEF
13863 UNICODE_SWAPCASE_METHODDEF
13864 UNICODE_TRANSLATE_METHODDEF
13865 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013866 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13867 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090013868 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013869 UNICODE_ISLOWER_METHODDEF
13870 UNICODE_ISUPPER_METHODDEF
13871 UNICODE_ISTITLE_METHODDEF
13872 UNICODE_ISSPACE_METHODDEF
13873 UNICODE_ISDECIMAL_METHODDEF
13874 UNICODE_ISDIGIT_METHODDEF
13875 UNICODE_ISNUMERIC_METHODDEF
13876 UNICODE_ISALPHA_METHODDEF
13877 UNICODE_ISALNUM_METHODDEF
13878 UNICODE_ISIDENTIFIER_METHODDEF
13879 UNICODE_ISPRINTABLE_METHODDEF
13880 UNICODE_ZFILL_METHODDEF
Eric Smith9cd1e092007-08-31 18:39:38 +000013881 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013882 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013883 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013884 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013885 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013886#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013887 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013888 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013889#endif
13890
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013891 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013892 {NULL, NULL}
13893};
13894
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013895static PyObject *
13896unicode_mod(PyObject *v, PyObject *w)
13897{
Brian Curtindfc80e32011-08-10 20:28:54 -050013898 if (!PyUnicode_Check(v))
13899 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013900 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013901}
13902
13903static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013904 0, /*nb_add*/
13905 0, /*nb_subtract*/
13906 0, /*nb_multiply*/
13907 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013908};
13909
Guido van Rossumd57fd912000-03-10 22:53:23 +000013910static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013911 (lenfunc) unicode_length, /* sq_length */
13912 PyUnicode_Concat, /* sq_concat */
13913 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13914 (ssizeargfunc) unicode_getitem, /* sq_item */
13915 0, /* sq_slice */
13916 0, /* sq_ass_item */
13917 0, /* sq_ass_slice */
13918 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013919};
13920
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013921static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013922unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013923{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013924 if (PyUnicode_READY(self) == -1)
13925 return NULL;
13926
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013927 if (PyIndex_Check(item)) {
13928 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013929 if (i == -1 && PyErr_Occurred())
13930 return NULL;
13931 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013932 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013933 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013934 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013935 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013936 PyObject *result;
13937 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013938 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013939 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013940
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013941 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013942 return NULL;
13943 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013944 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13945 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013946
13947 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013948 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013949 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013950 slicelength == PyUnicode_GET_LENGTH(self)) {
13951 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013952 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013953 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013954 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013955 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013956 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013957 src_kind = PyUnicode_KIND(self);
13958 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013959 if (!PyUnicode_IS_ASCII(self)) {
13960 kind_limit = kind_maxchar_limit(src_kind);
13961 max_char = 0;
13962 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13963 ch = PyUnicode_READ(src_kind, src_data, cur);
13964 if (ch > max_char) {
13965 max_char = ch;
13966 if (max_char >= kind_limit)
13967 break;
13968 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013969 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013970 }
Victor Stinner55c99112011-10-13 01:17:06 +020013971 else
13972 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013973 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013974 if (result == NULL)
13975 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013976 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013977 dest_data = PyUnicode_DATA(result);
13978
13979 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013980 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13981 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013982 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013983 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013984 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013985 } else {
13986 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13987 return NULL;
13988 }
13989}
13990
13991static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013992 (lenfunc)unicode_length, /* mp_length */
13993 (binaryfunc)unicode_subscript, /* mp_subscript */
13994 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013995};
13996
Guido van Rossumd57fd912000-03-10 22:53:23 +000013997
Guido van Rossumd57fd912000-03-10 22:53:23 +000013998/* Helpers for PyUnicode_Format() */
13999
Victor Stinnera47082312012-10-04 02:19:54 +020014000struct unicode_formatter_t {
14001 PyObject *args;
14002 int args_owned;
14003 Py_ssize_t arglen, argidx;
14004 PyObject *dict;
14005
14006 enum PyUnicode_Kind fmtkind;
14007 Py_ssize_t fmtcnt, fmtpos;
14008 void *fmtdata;
14009 PyObject *fmtstr;
14010
14011 _PyUnicodeWriter writer;
14012};
14013
14014struct unicode_format_arg_t {
14015 Py_UCS4 ch;
14016 int flags;
14017 Py_ssize_t width;
14018 int prec;
14019 int sign;
14020};
14021
Guido van Rossumd57fd912000-03-10 22:53:23 +000014022static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014023unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014024{
Victor Stinnera47082312012-10-04 02:19:54 +020014025 Py_ssize_t argidx = ctx->argidx;
14026
14027 if (argidx < ctx->arglen) {
14028 ctx->argidx++;
14029 if (ctx->arglen < 0)
14030 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014031 else
Victor Stinnera47082312012-10-04 02:19:54 +020014032 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014033 }
14034 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014035 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014036 return NULL;
14037}
14038
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014039/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014040
Victor Stinnera47082312012-10-04 02:19:54 +020014041/* Format a float into the writer if the writer is not NULL, or into *p_output
14042 otherwise.
14043
14044 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014045static int
Victor Stinnera47082312012-10-04 02:19:54 +020014046formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14047 PyObject **p_output,
14048 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014049{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014050 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014051 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014052 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014053 int prec;
14054 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014055
Guido van Rossumd57fd912000-03-10 22:53:23 +000014056 x = PyFloat_AsDouble(v);
14057 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014058 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014059
Victor Stinnera47082312012-10-04 02:19:54 +020014060 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014061 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014062 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014063
Victor Stinnera47082312012-10-04 02:19:54 +020014064 if (arg->flags & F_ALT)
14065 dtoa_flags = Py_DTSF_ALT;
14066 else
14067 dtoa_flags = 0;
14068 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014069 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014070 return -1;
14071 len = strlen(p);
14072 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014073 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014074 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014075 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014076 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014077 }
14078 else
14079 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014080 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014081 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014082}
14083
Victor Stinnerd0880d52012-04-27 23:40:13 +020014084/* formatlong() emulates the format codes d, u, o, x and X, and
14085 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14086 * Python's regular ints.
14087 * Return value: a new PyUnicodeObject*, or NULL if error.
14088 * The output string is of the form
14089 * "-"? ("0x" | "0X")? digit+
14090 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14091 * set in flags. The case of hex digits will be correct,
14092 * There will be at least prec digits, zero-filled on the left if
14093 * necessary to get that many.
14094 * val object to be converted
14095 * flags bitmask of format flags; only F_ALT is looked at
14096 * prec minimum number of digits; 0-fill on left if needed
14097 * type a character in [duoxX]; u acts the same as d
14098 *
14099 * CAUTION: o, x and X conversions on regular ints can never
14100 * produce a '-' sign, but can for Python's unbounded ints.
14101 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014102PyObject *
14103_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014104{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014105 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014106 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014107 Py_ssize_t i;
14108 int sign; /* 1 if '-', else 0 */
14109 int len; /* number of characters */
14110 Py_ssize_t llen;
14111 int numdigits; /* len == numnondigits + numdigits */
14112 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014113
Victor Stinnerd0880d52012-04-27 23:40:13 +020014114 /* Avoid exceeding SSIZE_T_MAX */
14115 if (prec > INT_MAX-3) {
14116 PyErr_SetString(PyExc_OverflowError,
14117 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014118 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014119 }
14120
14121 assert(PyLong_Check(val));
14122
14123 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014124 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014125 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014126 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014127 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014128 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014129 /* int and int subclasses should print numerically when a numeric */
14130 /* format code is used (see issue18780) */
14131 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014132 break;
14133 case 'o':
14134 numnondigits = 2;
14135 result = PyNumber_ToBase(val, 8);
14136 break;
14137 case 'x':
14138 case 'X':
14139 numnondigits = 2;
14140 result = PyNumber_ToBase(val, 16);
14141 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014142 }
14143 if (!result)
14144 return NULL;
14145
14146 assert(unicode_modifiable(result));
14147 assert(PyUnicode_IS_READY(result));
14148 assert(PyUnicode_IS_ASCII(result));
14149
14150 /* To modify the string in-place, there can only be one reference. */
14151 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014152 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014153 PyErr_BadInternalCall();
14154 return NULL;
14155 }
14156 buf = PyUnicode_DATA(result);
14157 llen = PyUnicode_GET_LENGTH(result);
14158 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014159 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014160 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014161 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014162 return NULL;
14163 }
14164 len = (int)llen;
14165 sign = buf[0] == '-';
14166 numnondigits += sign;
14167 numdigits = len - numnondigits;
14168 assert(numdigits > 0);
14169
14170 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014171 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014172 (type == 'o' || type == 'x' || type == 'X'))) {
14173 assert(buf[sign] == '0');
14174 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14175 buf[sign+1] == 'o');
14176 numnondigits -= 2;
14177 buf += 2;
14178 len -= 2;
14179 if (sign)
14180 buf[0] = '-';
14181 assert(len == numnondigits + numdigits);
14182 assert(numdigits > 0);
14183 }
14184
14185 /* Fill with leading zeroes to meet minimum width. */
14186 if (prec > numdigits) {
14187 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14188 numnondigits + prec);
14189 char *b1;
14190 if (!r1) {
14191 Py_DECREF(result);
14192 return NULL;
14193 }
14194 b1 = PyBytes_AS_STRING(r1);
14195 for (i = 0; i < numnondigits; ++i)
14196 *b1++ = *buf++;
14197 for (i = 0; i < prec - numdigits; i++)
14198 *b1++ = '0';
14199 for (i = 0; i < numdigits; i++)
14200 *b1++ = *buf++;
14201 *b1 = '\0';
14202 Py_DECREF(result);
14203 result = r1;
14204 buf = PyBytes_AS_STRING(result);
14205 len = numnondigits + prec;
14206 }
14207
14208 /* Fix up case for hex conversions. */
14209 if (type == 'X') {
14210 /* Need to convert all lower case letters to upper case.
14211 and need to convert 0x to 0X (and -0x to -0X). */
14212 for (i = 0; i < len; i++)
14213 if (buf[i] >= 'a' && buf[i] <= 'x')
14214 buf[i] -= 'a'-'A';
14215 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014216 if (!PyUnicode_Check(result)
14217 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014218 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014219 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014220 Py_DECREF(result);
14221 result = unicode;
14222 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014223 else if (len != PyUnicode_GET_LENGTH(result)) {
14224 if (PyUnicode_Resize(&result, len) < 0)
14225 Py_CLEAR(result);
14226 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014227 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014228}
14229
Ethan Furmandf3ed242014-01-05 06:50:30 -080014230/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014231 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014232 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014233 * -1 and raise an exception on error */
14234static int
Victor Stinnera47082312012-10-04 02:19:54 +020014235mainformatlong(PyObject *v,
14236 struct unicode_format_arg_t *arg,
14237 PyObject **p_output,
14238 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014239{
14240 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014241 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014242
14243 if (!PyNumber_Check(v))
14244 goto wrongtype;
14245
Ethan Furman9ab74802014-03-21 06:38:46 -070014246 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014247 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014248 if (type == 'o' || type == 'x' || type == 'X') {
14249 iobj = PyNumber_Index(v);
14250 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014251 if (PyErr_ExceptionMatches(PyExc_TypeError))
14252 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014253 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014254 }
14255 }
14256 else {
14257 iobj = PyNumber_Long(v);
14258 if (iobj == NULL ) {
14259 if (PyErr_ExceptionMatches(PyExc_TypeError))
14260 goto wrongtype;
14261 return -1;
14262 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014263 }
14264 assert(PyLong_Check(iobj));
14265 }
14266 else {
14267 iobj = v;
14268 Py_INCREF(iobj);
14269 }
14270
14271 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014272 && arg->width == -1 && arg->prec == -1
14273 && !(arg->flags & (F_SIGN | F_BLANK))
14274 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014275 {
14276 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014277 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014278 int base;
14279
Victor Stinnera47082312012-10-04 02:19:54 +020014280 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014281 {
14282 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014283 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014284 case 'd':
14285 case 'i':
14286 case 'u':
14287 base = 10;
14288 break;
14289 case 'o':
14290 base = 8;
14291 break;
14292 case 'x':
14293 case 'X':
14294 base = 16;
14295 break;
14296 }
14297
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014298 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14299 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014300 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014301 }
14302 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014303 return 1;
14304 }
14305
Ethan Furmanb95b5612015-01-23 20:05:18 -080014306 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014307 Py_DECREF(iobj);
14308 if (res == NULL)
14309 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014310 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014311 return 0;
14312
14313wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014314 switch(type)
14315 {
14316 case 'o':
14317 case 'x':
14318 case 'X':
14319 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014320 "%%%c format: an integer is required, "
14321 "not %.200s",
14322 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014323 break;
14324 default:
14325 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014326 "%%%c format: a number is required, "
14327 "not %.200s",
14328 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014329 break;
14330 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014331 return -1;
14332}
14333
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014334static Py_UCS4
14335formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014336{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014337 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014338 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014339 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014340 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014341 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014342 goto onError;
14343 }
14344 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014345 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014346 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014347 /* make sure number is a type of integer */
14348 if (!PyLong_Check(v)) {
14349 iobj = PyNumber_Index(v);
14350 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014351 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014352 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014353 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014354 Py_DECREF(iobj);
14355 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014356 else {
14357 x = PyLong_AsLong(v);
14358 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014359 if (x == -1 && PyErr_Occurred())
14360 goto onError;
14361
Victor Stinner8faf8212011-12-08 22:14:11 +010014362 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014363 PyErr_SetString(PyExc_OverflowError,
14364 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014365 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014366 }
14367
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014368 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014369 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014370
Benjamin Peterson29060642009-01-31 22:14:21 +000014371 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014372 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014373 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014374 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014375}
14376
Victor Stinnera47082312012-10-04 02:19:54 +020014377/* Parse options of an argument: flags, width, precision.
14378 Handle also "%(name)" syntax.
14379
14380 Return 0 if the argument has been formatted into arg->str.
14381 Return 1 if the argument has been written into ctx->writer,
14382 Raise an exception and return -1 on error. */
14383static int
14384unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14385 struct unicode_format_arg_t *arg)
14386{
14387#define FORMAT_READ(ctx) \
14388 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14389
14390 PyObject *v;
14391
Victor Stinnera47082312012-10-04 02:19:54 +020014392 if (arg->ch == '(') {
14393 /* Get argument value from a dictionary. Example: "%(name)s". */
14394 Py_ssize_t keystart;
14395 Py_ssize_t keylen;
14396 PyObject *key;
14397 int pcount = 1;
14398
14399 if (ctx->dict == NULL) {
14400 PyErr_SetString(PyExc_TypeError,
14401 "format requires a mapping");
14402 return -1;
14403 }
14404 ++ctx->fmtpos;
14405 --ctx->fmtcnt;
14406 keystart = ctx->fmtpos;
14407 /* Skip over balanced parentheses */
14408 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14409 arg->ch = FORMAT_READ(ctx);
14410 if (arg->ch == ')')
14411 --pcount;
14412 else if (arg->ch == '(')
14413 ++pcount;
14414 ctx->fmtpos++;
14415 }
14416 keylen = ctx->fmtpos - keystart - 1;
14417 if (ctx->fmtcnt < 0 || pcount > 0) {
14418 PyErr_SetString(PyExc_ValueError,
14419 "incomplete format key");
14420 return -1;
14421 }
14422 key = PyUnicode_Substring(ctx->fmtstr,
14423 keystart, keystart + keylen);
14424 if (key == NULL)
14425 return -1;
14426 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014427 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014428 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014429 }
14430 ctx->args = PyObject_GetItem(ctx->dict, key);
14431 Py_DECREF(key);
14432 if (ctx->args == NULL)
14433 return -1;
14434 ctx->args_owned = 1;
14435 ctx->arglen = -1;
14436 ctx->argidx = -2;
14437 }
14438
14439 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014440 while (--ctx->fmtcnt >= 0) {
14441 arg->ch = FORMAT_READ(ctx);
14442 ctx->fmtpos++;
14443 switch (arg->ch) {
14444 case '-': arg->flags |= F_LJUST; continue;
14445 case '+': arg->flags |= F_SIGN; continue;
14446 case ' ': arg->flags |= F_BLANK; continue;
14447 case '#': arg->flags |= F_ALT; continue;
14448 case '0': arg->flags |= F_ZERO; continue;
14449 }
14450 break;
14451 }
14452
14453 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014454 if (arg->ch == '*') {
14455 v = unicode_format_getnextarg(ctx);
14456 if (v == NULL)
14457 return -1;
14458 if (!PyLong_Check(v)) {
14459 PyErr_SetString(PyExc_TypeError,
14460 "* wants int");
14461 return -1;
14462 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014463 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014464 if (arg->width == -1 && PyErr_Occurred())
14465 return -1;
14466 if (arg->width < 0) {
14467 arg->flags |= F_LJUST;
14468 arg->width = -arg->width;
14469 }
14470 if (--ctx->fmtcnt >= 0) {
14471 arg->ch = FORMAT_READ(ctx);
14472 ctx->fmtpos++;
14473 }
14474 }
14475 else if (arg->ch >= '0' && arg->ch <= '9') {
14476 arg->width = arg->ch - '0';
14477 while (--ctx->fmtcnt >= 0) {
14478 arg->ch = FORMAT_READ(ctx);
14479 ctx->fmtpos++;
14480 if (arg->ch < '0' || arg->ch > '9')
14481 break;
14482 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14483 mixing signed and unsigned comparison. Since arg->ch is between
14484 '0' and '9', casting to int is safe. */
14485 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14486 PyErr_SetString(PyExc_ValueError,
14487 "width too big");
14488 return -1;
14489 }
14490 arg->width = arg->width*10 + (arg->ch - '0');
14491 }
14492 }
14493
14494 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014495 if (arg->ch == '.') {
14496 arg->prec = 0;
14497 if (--ctx->fmtcnt >= 0) {
14498 arg->ch = FORMAT_READ(ctx);
14499 ctx->fmtpos++;
14500 }
14501 if (arg->ch == '*') {
14502 v = unicode_format_getnextarg(ctx);
14503 if (v == NULL)
14504 return -1;
14505 if (!PyLong_Check(v)) {
14506 PyErr_SetString(PyExc_TypeError,
14507 "* wants int");
14508 return -1;
14509 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014510 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014511 if (arg->prec == -1 && PyErr_Occurred())
14512 return -1;
14513 if (arg->prec < 0)
14514 arg->prec = 0;
14515 if (--ctx->fmtcnt >= 0) {
14516 arg->ch = FORMAT_READ(ctx);
14517 ctx->fmtpos++;
14518 }
14519 }
14520 else if (arg->ch >= '0' && arg->ch <= '9') {
14521 arg->prec = arg->ch - '0';
14522 while (--ctx->fmtcnt >= 0) {
14523 arg->ch = FORMAT_READ(ctx);
14524 ctx->fmtpos++;
14525 if (arg->ch < '0' || arg->ch > '9')
14526 break;
14527 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14528 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014529 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014530 return -1;
14531 }
14532 arg->prec = arg->prec*10 + (arg->ch - '0');
14533 }
14534 }
14535 }
14536
14537 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14538 if (ctx->fmtcnt >= 0) {
14539 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14540 if (--ctx->fmtcnt >= 0) {
14541 arg->ch = FORMAT_READ(ctx);
14542 ctx->fmtpos++;
14543 }
14544 }
14545 }
14546 if (ctx->fmtcnt < 0) {
14547 PyErr_SetString(PyExc_ValueError,
14548 "incomplete format");
14549 return -1;
14550 }
14551 return 0;
14552
14553#undef FORMAT_READ
14554}
14555
14556/* Format one argument. Supported conversion specifiers:
14557
14558 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014559 - "i", "d", "u": int or float
14560 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014561 - "e", "E", "f", "F", "g", "G": float
14562 - "c": int or str (1 character)
14563
Victor Stinner8dbd4212012-12-04 09:30:24 +010014564 When possible, the output is written directly into the Unicode writer
14565 (ctx->writer). A string is created when padding is required.
14566
Victor Stinnera47082312012-10-04 02:19:54 +020014567 Return 0 if the argument has been formatted into *p_str,
14568 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014569 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014570static int
14571unicode_format_arg_format(struct unicode_formatter_t *ctx,
14572 struct unicode_format_arg_t *arg,
14573 PyObject **p_str)
14574{
14575 PyObject *v;
14576 _PyUnicodeWriter *writer = &ctx->writer;
14577
14578 if (ctx->fmtcnt == 0)
14579 ctx->writer.overallocate = 0;
14580
Victor Stinnera47082312012-10-04 02:19:54 +020014581 v = unicode_format_getnextarg(ctx);
14582 if (v == NULL)
14583 return -1;
14584
Victor Stinnera47082312012-10-04 02:19:54 +020014585
14586 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014587 case 's':
14588 case 'r':
14589 case 'a':
14590 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14591 /* Fast path */
14592 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14593 return -1;
14594 return 1;
14595 }
14596
14597 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14598 *p_str = v;
14599 Py_INCREF(*p_str);
14600 }
14601 else {
14602 if (arg->ch == 's')
14603 *p_str = PyObject_Str(v);
14604 else if (arg->ch == 'r')
14605 *p_str = PyObject_Repr(v);
14606 else
14607 *p_str = PyObject_ASCII(v);
14608 }
14609 break;
14610
14611 case 'i':
14612 case 'd':
14613 case 'u':
14614 case 'o':
14615 case 'x':
14616 case 'X':
14617 {
14618 int ret = mainformatlong(v, arg, p_str, writer);
14619 if (ret != 0)
14620 return ret;
14621 arg->sign = 1;
14622 break;
14623 }
14624
14625 case 'e':
14626 case 'E':
14627 case 'f':
14628 case 'F':
14629 case 'g':
14630 case 'G':
14631 if (arg->width == -1 && arg->prec == -1
14632 && !(arg->flags & (F_SIGN | F_BLANK)))
14633 {
14634 /* Fast path */
14635 if (formatfloat(v, arg, NULL, writer) == -1)
14636 return -1;
14637 return 1;
14638 }
14639
14640 arg->sign = 1;
14641 if (formatfloat(v, arg, p_str, NULL) == -1)
14642 return -1;
14643 break;
14644
14645 case 'c':
14646 {
14647 Py_UCS4 ch = formatchar(v);
14648 if (ch == (Py_UCS4) -1)
14649 return -1;
14650 if (arg->width == -1 && arg->prec == -1) {
14651 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014652 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014653 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014654 return 1;
14655 }
14656 *p_str = PyUnicode_FromOrdinal(ch);
14657 break;
14658 }
14659
14660 default:
14661 PyErr_Format(PyExc_ValueError,
14662 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014663 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014664 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14665 (int)arg->ch,
14666 ctx->fmtpos - 1);
14667 return -1;
14668 }
14669 if (*p_str == NULL)
14670 return -1;
14671 assert (PyUnicode_Check(*p_str));
14672 return 0;
14673}
14674
14675static int
14676unicode_format_arg_output(struct unicode_formatter_t *ctx,
14677 struct unicode_format_arg_t *arg,
14678 PyObject *str)
14679{
14680 Py_ssize_t len;
14681 enum PyUnicode_Kind kind;
14682 void *pbuf;
14683 Py_ssize_t pindex;
14684 Py_UCS4 signchar;
14685 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014686 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014687 Py_ssize_t sublen;
14688 _PyUnicodeWriter *writer = &ctx->writer;
14689 Py_UCS4 fill;
14690
14691 fill = ' ';
14692 if (arg->sign && arg->flags & F_ZERO)
14693 fill = '0';
14694
14695 if (PyUnicode_READY(str) == -1)
14696 return -1;
14697
14698 len = PyUnicode_GET_LENGTH(str);
14699 if ((arg->width == -1 || arg->width <= len)
14700 && (arg->prec == -1 || arg->prec >= len)
14701 && !(arg->flags & (F_SIGN | F_BLANK)))
14702 {
14703 /* Fast path */
14704 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14705 return -1;
14706 return 0;
14707 }
14708
14709 /* Truncate the string for "s", "r" and "a" formats
14710 if the precision is set */
14711 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14712 if (arg->prec >= 0 && len > arg->prec)
14713 len = arg->prec;
14714 }
14715
14716 /* Adjust sign and width */
14717 kind = PyUnicode_KIND(str);
14718 pbuf = PyUnicode_DATA(str);
14719 pindex = 0;
14720 signchar = '\0';
14721 if (arg->sign) {
14722 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14723 if (ch == '-' || ch == '+') {
14724 signchar = ch;
14725 len--;
14726 pindex++;
14727 }
14728 else if (arg->flags & F_SIGN)
14729 signchar = '+';
14730 else if (arg->flags & F_BLANK)
14731 signchar = ' ';
14732 else
14733 arg->sign = 0;
14734 }
14735 if (arg->width < len)
14736 arg->width = len;
14737
14738 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014739 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014740 if (!(arg->flags & F_LJUST)) {
14741 if (arg->sign) {
14742 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014743 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014744 }
14745 else {
14746 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014747 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014748 }
14749 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014750 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14751 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014752 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014753 }
14754
Victor Stinnera47082312012-10-04 02:19:54 +020014755 buflen = arg->width;
14756 if (arg->sign && len == arg->width)
14757 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014758 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014759 return -1;
14760
14761 /* Write the sign if needed */
14762 if (arg->sign) {
14763 if (fill != ' ') {
14764 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14765 writer->pos += 1;
14766 }
14767 if (arg->width > len)
14768 arg->width--;
14769 }
14770
14771 /* Write the numeric prefix for "x", "X" and "o" formats
14772 if the alternate form is used.
14773 For example, write "0x" for the "%#x" format. */
14774 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14775 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14776 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14777 if (fill != ' ') {
14778 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14779 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14780 writer->pos += 2;
14781 pindex += 2;
14782 }
14783 arg->width -= 2;
14784 if (arg->width < 0)
14785 arg->width = 0;
14786 len -= 2;
14787 }
14788
14789 /* Pad left with the fill character if needed */
14790 if (arg->width > len && !(arg->flags & F_LJUST)) {
14791 sublen = arg->width - len;
14792 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14793 writer->pos += sublen;
14794 arg->width = len;
14795 }
14796
14797 /* If padding with spaces: write sign if needed and/or numeric prefix if
14798 the alternate form is used */
14799 if (fill == ' ') {
14800 if (arg->sign) {
14801 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14802 writer->pos += 1;
14803 }
14804 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14805 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14806 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14807 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14808 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14809 writer->pos += 2;
14810 pindex += 2;
14811 }
14812 }
14813
14814 /* Write characters */
14815 if (len) {
14816 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14817 str, pindex, len);
14818 writer->pos += len;
14819 }
14820
14821 /* Pad right with the fill character if needed */
14822 if (arg->width > len) {
14823 sublen = arg->width - len;
14824 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14825 writer->pos += sublen;
14826 }
14827 return 0;
14828}
14829
14830/* Helper of PyUnicode_Format(): format one arg.
14831 Return 0 on success, raise an exception and return -1 on error. */
14832static int
14833unicode_format_arg(struct unicode_formatter_t *ctx)
14834{
14835 struct unicode_format_arg_t arg;
14836 PyObject *str;
14837 int ret;
14838
Victor Stinner8dbd4212012-12-04 09:30:24 +010014839 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014840 if (arg.ch == '%') {
14841 ctx->fmtpos++;
14842 ctx->fmtcnt--;
14843 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14844 return -1;
14845 return 0;
14846 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014847 arg.flags = 0;
14848 arg.width = -1;
14849 arg.prec = -1;
14850 arg.sign = 0;
14851 str = NULL;
14852
Victor Stinnera47082312012-10-04 02:19:54 +020014853 ret = unicode_format_arg_parse(ctx, &arg);
14854 if (ret == -1)
14855 return -1;
14856
14857 ret = unicode_format_arg_format(ctx, &arg, &str);
14858 if (ret == -1)
14859 return -1;
14860
14861 if (ret != 1) {
14862 ret = unicode_format_arg_output(ctx, &arg, str);
14863 Py_DECREF(str);
14864 if (ret == -1)
14865 return -1;
14866 }
14867
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014868 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014869 PyErr_SetString(PyExc_TypeError,
14870 "not all arguments converted during string formatting");
14871 return -1;
14872 }
14873 return 0;
14874}
14875
Alexander Belopolsky40018472011-02-26 01:02:56 +000014876PyObject *
14877PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014878{
Victor Stinnera47082312012-10-04 02:19:54 +020014879 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014880
Guido van Rossumd57fd912000-03-10 22:53:23 +000014881 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014882 PyErr_BadInternalCall();
14883 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014884 }
Victor Stinnera47082312012-10-04 02:19:54 +020014885
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014886 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014887 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014888
14889 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014890 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14891 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14892 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14893 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014894
Victor Stinner8f674cc2013-04-17 23:02:17 +020014895 _PyUnicodeWriter_Init(&ctx.writer);
14896 ctx.writer.min_length = ctx.fmtcnt + 100;
14897 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014898
Guido van Rossumd57fd912000-03-10 22:53:23 +000014899 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014900 ctx.arglen = PyTuple_Size(args);
14901 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014902 }
14903 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014904 ctx.arglen = -1;
14905 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014906 }
Victor Stinnera47082312012-10-04 02:19:54 +020014907 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014908 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014909 ctx.dict = args;
14910 else
14911 ctx.dict = NULL;
14912 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014913
Victor Stinnera47082312012-10-04 02:19:54 +020014914 while (--ctx.fmtcnt >= 0) {
14915 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014916 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014917
14918 nonfmtpos = ctx.fmtpos++;
14919 while (ctx.fmtcnt >= 0 &&
14920 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14921 ctx.fmtpos++;
14922 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014923 }
Victor Stinnera47082312012-10-04 02:19:54 +020014924 if (ctx.fmtcnt < 0) {
14925 ctx.fmtpos--;
14926 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014927 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014928
Victor Stinnercfc4c132013-04-03 01:48:39 +020014929 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14930 nonfmtpos, ctx.fmtpos) < 0)
14931 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014932 }
14933 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014934 ctx.fmtpos++;
14935 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014936 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014937 }
14938 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014939
Victor Stinnera47082312012-10-04 02:19:54 +020014940 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014941 PyErr_SetString(PyExc_TypeError,
14942 "not all arguments converted during string formatting");
14943 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014944 }
14945
Victor Stinnera47082312012-10-04 02:19:54 +020014946 if (ctx.args_owned) {
14947 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014948 }
Victor Stinnera47082312012-10-04 02:19:54 +020014949 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014950
Benjamin Peterson29060642009-01-31 22:14:21 +000014951 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014952 _PyUnicodeWriter_Dealloc(&ctx.writer);
14953 if (ctx.args_owned) {
14954 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014955 }
14956 return NULL;
14957}
14958
Jeremy Hylton938ace62002-07-17 16:30:39 +000014959static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014960unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14961
Tim Peters6d6c1a32001-08-02 04:15:00 +000014962static PyObject *
14963unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14964{
Benjamin Peterson29060642009-01-31 22:14:21 +000014965 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014966 static char *kwlist[] = {"object", "encoding", "errors", 0};
14967 char *encoding = NULL;
14968 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014969
Benjamin Peterson14339b62009-01-31 16:36:08 +000014970 if (type != &PyUnicode_Type)
14971 return unicode_subtype_new(type, args, kwds);
14972 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014973 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014974 return NULL;
14975 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014976 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014977 if (encoding == NULL && errors == NULL)
14978 return PyObject_Str(x);
14979 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014980 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014981}
14982
Guido van Rossume023fe02001-08-30 03:12:59 +000014983static PyObject *
14984unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14985{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014986 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014987 Py_ssize_t length, char_size;
14988 int share_wstr, share_utf8;
14989 unsigned int kind;
14990 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014991
Benjamin Peterson14339b62009-01-31 16:36:08 +000014992 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014993
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014994 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014995 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014996 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014997 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014998 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014999 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015000 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015001 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015002
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015003 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015004 if (self == NULL) {
15005 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015006 return NULL;
15007 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015008 kind = PyUnicode_KIND(unicode);
15009 length = PyUnicode_GET_LENGTH(unicode);
15010
15011 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015012#ifdef Py_DEBUG
15013 _PyUnicode_HASH(self) = -1;
15014#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015015 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015016#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015017 _PyUnicode_STATE(self).interned = 0;
15018 _PyUnicode_STATE(self).kind = kind;
15019 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015020 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015021 _PyUnicode_STATE(self).ready = 1;
15022 _PyUnicode_WSTR(self) = NULL;
15023 _PyUnicode_UTF8_LENGTH(self) = 0;
15024 _PyUnicode_UTF8(self) = NULL;
15025 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015026 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015027
15028 share_utf8 = 0;
15029 share_wstr = 0;
15030 if (kind == PyUnicode_1BYTE_KIND) {
15031 char_size = 1;
15032 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15033 share_utf8 = 1;
15034 }
15035 else if (kind == PyUnicode_2BYTE_KIND) {
15036 char_size = 2;
15037 if (sizeof(wchar_t) == 2)
15038 share_wstr = 1;
15039 }
15040 else {
15041 assert(kind == PyUnicode_4BYTE_KIND);
15042 char_size = 4;
15043 if (sizeof(wchar_t) == 4)
15044 share_wstr = 1;
15045 }
15046
15047 /* Ensure we won't overflow the length. */
15048 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15049 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015050 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015051 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015052 data = PyObject_MALLOC((length + 1) * char_size);
15053 if (data == NULL) {
15054 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015055 goto onError;
15056 }
15057
Victor Stinnerc3c74152011-10-02 20:39:55 +020015058 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015059 if (share_utf8) {
15060 _PyUnicode_UTF8_LENGTH(self) = length;
15061 _PyUnicode_UTF8(self) = data;
15062 }
15063 if (share_wstr) {
15064 _PyUnicode_WSTR_LENGTH(self) = length;
15065 _PyUnicode_WSTR(self) = (wchar_t *)data;
15066 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015067
Christian Heimesf051e432016-09-13 20:22:02 +020015068 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015069 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015070 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015071#ifdef Py_DEBUG
15072 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15073#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015074 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015075 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015076
15077onError:
15078 Py_DECREF(unicode);
15079 Py_DECREF(self);
15080 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015081}
15082
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015083PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015084"str(object='') -> str\n\
15085str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015086\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015087Create a new string object from the given object. If encoding or\n\
15088errors is specified, then the object must expose a data buffer\n\
15089that will be decoded using the given encoding and error handler.\n\
15090Otherwise, returns the result of object.__str__() (if defined)\n\
15091or repr(object).\n\
15092encoding defaults to sys.getdefaultencoding().\n\
15093errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015094
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015095static PyObject *unicode_iter(PyObject *seq);
15096
Guido van Rossumd57fd912000-03-10 22:53:23 +000015097PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015098 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015099 "str", /* tp_name */
15100 sizeof(PyUnicodeObject), /* tp_basicsize */
15101 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015102 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015103 (destructor)unicode_dealloc, /* tp_dealloc */
15104 0, /* tp_print */
15105 0, /* tp_getattr */
15106 0, /* tp_setattr */
15107 0, /* tp_reserved */
15108 unicode_repr, /* tp_repr */
15109 &unicode_as_number, /* tp_as_number */
15110 &unicode_as_sequence, /* tp_as_sequence */
15111 &unicode_as_mapping, /* tp_as_mapping */
15112 (hashfunc) unicode_hash, /* tp_hash*/
15113 0, /* tp_call*/
15114 (reprfunc) unicode_str, /* tp_str */
15115 PyObject_GenericGetAttr, /* tp_getattro */
15116 0, /* tp_setattro */
15117 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015118 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015119 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15120 unicode_doc, /* tp_doc */
15121 0, /* tp_traverse */
15122 0, /* tp_clear */
15123 PyUnicode_RichCompare, /* tp_richcompare */
15124 0, /* tp_weaklistoffset */
15125 unicode_iter, /* tp_iter */
15126 0, /* tp_iternext */
15127 unicode_methods, /* tp_methods */
15128 0, /* tp_members */
15129 0, /* tp_getset */
15130 &PyBaseObject_Type, /* tp_base */
15131 0, /* tp_dict */
15132 0, /* tp_descr_get */
15133 0, /* tp_descr_set */
15134 0, /* tp_dictoffset */
15135 0, /* tp_init */
15136 0, /* tp_alloc */
15137 unicode_new, /* tp_new */
15138 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015139};
15140
15141/* Initialize the Unicode implementation */
15142
Victor Stinner3a50e702011-10-18 21:21:00 +020015143int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015144{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015145 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015146 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015147 0x000A, /* LINE FEED */
15148 0x000D, /* CARRIAGE RETURN */
15149 0x001C, /* FILE SEPARATOR */
15150 0x001D, /* GROUP SEPARATOR */
15151 0x001E, /* RECORD SEPARATOR */
15152 0x0085, /* NEXT LINE */
15153 0x2028, /* LINE SEPARATOR */
15154 0x2029, /* PARAGRAPH SEPARATOR */
15155 };
15156
Fred Drakee4315f52000-05-09 19:53:39 +000015157 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015158 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015159 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015160 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015161 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015162
Guido van Rossumcacfc072002-05-24 19:01:59 +000015163 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015164 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015165
15166 /* initialize the linebreak bloom filter */
15167 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015168 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015169 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015170
Christian Heimes26532f72013-07-20 14:57:16 +020015171 if (PyType_Ready(&EncodingMapType) < 0)
15172 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015173
Benjamin Petersonc4311282012-10-30 23:21:10 -040015174 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15175 Py_FatalError("Can't initialize field name iterator type");
15176
15177 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15178 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015179
Victor Stinner3a50e702011-10-18 21:21:00 +020015180 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015181}
15182
15183/* Finalize the Unicode implementation */
15184
/* Performs no work and always returns 0.
   NOTE(review): presumably kept as a no-op so existing callers of the
   free-list API keep working -- confirm against the public headers. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15190
Guido van Rossumd57fd912000-03-10 22:53:23 +000015191void
Thomas Wouters78890102000-07-22 19:25:51 +000015192_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015193{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015194 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015195
Serhiy Storchaka05997252013-01-26 12:14:02 +020015196 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015197
Serhiy Storchaka05997252013-01-26 12:14:02 +020015198 for (i = 0; i < 256; i++)
15199 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015200 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015201 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015202}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015203
Walter Dörwald16807132007-05-25 13:52:07 +000015204void
15205PyUnicode_InternInPlace(PyObject **p)
15206{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015207 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015208 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015209#ifdef Py_DEBUG
15210 assert(s != NULL);
15211 assert(_PyUnicode_CHECK(s));
15212#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015213 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015214 return;
15215#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015216 /* If it's a subclass, we don't really know what putting
15217 it in the interned dict might do. */
15218 if (!PyUnicode_CheckExact(s))
15219 return;
15220 if (PyUnicode_CHECK_INTERNED(s))
15221 return;
15222 if (interned == NULL) {
15223 interned = PyDict_New();
15224 if (interned == NULL) {
15225 PyErr_Clear(); /* Don't leave an exception */
15226 return;
15227 }
15228 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015229 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015230 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015231 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015232 if (t == NULL) {
15233 PyErr_Clear();
15234 return;
15235 }
15236 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015237 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015238 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015239 return;
15240 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015241 /* The two references in interned are not counted by refcnt.
15242 The deallocator will take care of this */
15243 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015244 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015245}
15246
15247void
15248PyUnicode_InternImmortal(PyObject **p)
15249{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015250 PyUnicode_InternInPlace(p);
15251 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015252 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015253 Py_INCREF(*p);
15254 }
Walter Dörwald16807132007-05-25 13:52:07 +000015255}
15256
15257PyObject *
15258PyUnicode_InternFromString(const char *cp)
15259{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015260 PyObject *s = PyUnicode_FromString(cp);
15261 if (s == NULL)
15262 return NULL;
15263 PyUnicode_InternInPlace(&s);
15264 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015265}
15266
Alexander Belopolsky40018472011-02-26 01:02:56 +000015267void
15268_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015269{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015270 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015271 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015272 Py_ssize_t i, n;
15273 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015274
Benjamin Peterson14339b62009-01-31 16:36:08 +000015275 if (interned == NULL || !PyDict_Check(interned))
15276 return;
15277 keys = PyDict_Keys(interned);
15278 if (keys == NULL || !PyList_Check(keys)) {
15279 PyErr_Clear();
15280 return;
15281 }
Walter Dörwald16807132007-05-25 13:52:07 +000015282
Benjamin Peterson14339b62009-01-31 16:36:08 +000015283 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15284 detector, interned unicode strings are not forcibly deallocated;
15285 rather, we give them their stolen references back, and then clear
15286 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015287
Benjamin Peterson14339b62009-01-31 16:36:08 +000015288 n = PyList_GET_SIZE(keys);
15289 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015290 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015291 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015292 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015293 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015294 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015295 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015296 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015297 case SSTATE_NOT_INTERNED:
15298 /* XXX Shouldn't happen */
15299 break;
15300 case SSTATE_INTERNED_IMMORTAL:
15301 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015302 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015303 break;
15304 case SSTATE_INTERNED_MORTAL:
15305 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015306 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015307 break;
15308 default:
15309 Py_FatalError("Inconsistent interned string state.");
15310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015311 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015312 }
15313 fprintf(stderr, "total size of all interned strings: "
15314 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15315 "mortal/immortal\n", mortal_size, immortal_size);
15316 Py_DECREF(keys);
15317 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015318 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015319}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015320
15321
15322/********************* Unicode Iterator **************************/
15323
15324typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015325 PyObject_HEAD
15326 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015327 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015328} unicodeiterobject;
15329
15330static void
15331unicodeiter_dealloc(unicodeiterobject *it)
15332{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015333 _PyObject_GC_UNTRACK(it);
15334 Py_XDECREF(it->it_seq);
15335 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015336}
15337
15338static int
15339unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15340{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015341 Py_VISIT(it->it_seq);
15342 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015343}
15344
15345static PyObject *
15346unicodeiter_next(unicodeiterobject *it)
15347{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015348 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015349
Benjamin Peterson14339b62009-01-31 16:36:08 +000015350 assert(it != NULL);
15351 seq = it->it_seq;
15352 if (seq == NULL)
15353 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015354 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015356 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15357 int kind = PyUnicode_KIND(seq);
15358 void *data = PyUnicode_DATA(seq);
15359 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15360 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015361 if (item != NULL)
15362 ++it->it_index;
15363 return item;
15364 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015365
Benjamin Peterson14339b62009-01-31 16:36:08 +000015366 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015367 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015368 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015369}
15370
15371static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015372unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015373{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015374 Py_ssize_t len = 0;
15375 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015376 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015377 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015378}
15379
15380PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15381
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015382static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015383unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015384{
15385 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015386 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015387 it->it_seq, it->it_index);
15388 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015389 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015390 if (u == NULL)
15391 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015392 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015393 }
15394}
15395
15396PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15397
15398static PyObject *
15399unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15400{
15401 Py_ssize_t index = PyLong_AsSsize_t(state);
15402 if (index == -1 && PyErr_Occurred())
15403 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015404 if (it->it_seq != NULL) {
15405 if (index < 0)
15406 index = 0;
15407 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15408 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15409 it->it_index = index;
15410 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015411 Py_RETURN_NONE;
15412}
15413
15414PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15415
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015416static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015417 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015418 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015419 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15420 reduce_doc},
15421 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15422 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015423 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015424};
15425
15426PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015427 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15428 "str_iterator", /* tp_name */
15429 sizeof(unicodeiterobject), /* tp_basicsize */
15430 0, /* tp_itemsize */
15431 /* methods */
15432 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15433 0, /* tp_print */
15434 0, /* tp_getattr */
15435 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015436 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015437 0, /* tp_repr */
15438 0, /* tp_as_number */
15439 0, /* tp_as_sequence */
15440 0, /* tp_as_mapping */
15441 0, /* tp_hash */
15442 0, /* tp_call */
15443 0, /* tp_str */
15444 PyObject_GenericGetAttr, /* tp_getattro */
15445 0, /* tp_setattro */
15446 0, /* tp_as_buffer */
15447 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15448 0, /* tp_doc */
15449 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15450 0, /* tp_clear */
15451 0, /* tp_richcompare */
15452 0, /* tp_weaklistoffset */
15453 PyObject_SelfIter, /* tp_iter */
15454 (iternextfunc)unicodeiter_next, /* tp_iternext */
15455 unicodeiter_methods, /* tp_methods */
15456 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015457};
15458
15459static PyObject *
15460unicode_iter(PyObject *seq)
15461{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015462 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015463
Benjamin Peterson14339b62009-01-31 16:36:08 +000015464 if (!PyUnicode_Check(seq)) {
15465 PyErr_BadInternalCall();
15466 return NULL;
15467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015468 if (PyUnicode_READY(seq) == -1)
15469 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015470 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15471 if (it == NULL)
15472 return NULL;
15473 it->it_index = 0;
15474 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015475 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015476 _PyObject_GC_TRACK(it);
15477 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015478}
15479
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015480
15481size_t
15482Py_UNICODE_strlen(const Py_UNICODE *u)
15483{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015484 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015485}
15486
15487Py_UNICODE*
15488Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15489{
15490 Py_UNICODE *u = s1;
15491 while ((*u++ = *s2++));
15492 return s1;
15493}
15494
15495Py_UNICODE*
15496Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15497{
15498 Py_UNICODE *u = s1;
15499 while ((*u++ = *s2++))
15500 if (n-- == 0)
15501 break;
15502 return s1;
15503}
15504
15505Py_UNICODE*
15506Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15507{
15508 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015509 u1 += wcslen(u1);
15510 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015511 return s1;
15512}
15513
15514int
15515Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15516{
15517 while (*s1 && *s2 && *s1 == *s2)
15518 s1++, s2++;
15519 if (*s1 && *s2)
15520 return (*s1 < *s2) ? -1 : +1;
15521 if (*s1)
15522 return 1;
15523 if (*s2)
15524 return -1;
15525 return 0;
15526}
15527
15528int
15529Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15530{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015531 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015532 for (; n != 0; n--) {
15533 u1 = *s1;
15534 u2 = *s2;
15535 if (u1 != u2)
15536 return (u1 < u2) ? -1 : +1;
15537 if (u1 == '\0')
15538 return 0;
15539 s1++;
15540 s2++;
15541 }
15542 return 0;
15543}
15544
15545Py_UNICODE*
15546Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15547{
15548 const Py_UNICODE *p;
15549 for (p = s; *p; p++)
15550 if (*p == c)
15551 return (Py_UNICODE*)p;
15552 return NULL;
15553}
15554
15555Py_UNICODE*
15556Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15557{
15558 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015559 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015560 while (p != s) {
15561 p--;
15562 if (*p == c)
15563 return (Py_UNICODE*)p;
15564 }
15565 return NULL;
15566}
Victor Stinner331ea922010-08-10 16:37:20 +000015567
Victor Stinner71133ff2010-09-01 23:43:53 +000015568Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015569PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015570{
Victor Stinner577db2c2011-10-11 22:12:48 +020015571 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015572 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015573
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015574 if (!PyUnicode_Check(unicode)) {
15575 PyErr_BadArgument();
15576 return NULL;
15577 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015578 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015579 if (u == NULL)
15580 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015581 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015582 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015583 PyErr_NoMemory();
15584 return NULL;
15585 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015586 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015587 size *= sizeof(Py_UNICODE);
15588 copy = PyMem_Malloc(size);
15589 if (copy == NULL) {
15590 PyErr_NoMemory();
15591 return NULL;
15592 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015593 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015594 return copy;
15595}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015596
Georg Brandl66c221e2010-10-14 07:04:07 +000015597/* A _string module, to export formatter_parser and formatter_field_name_split
15598 to the string.Formatter class implemented in Python. */
15599
15600static PyMethodDef _string_methods[] = {
15601 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15602 METH_O, PyDoc_STR("split the argument as a field name")},
15603 {"formatter_parser", (PyCFunction) formatter_parser,
15604 METH_O, PyDoc_STR("parse the argument as a format string")},
15605 {NULL, NULL}
15606};
15607
15608static struct PyModuleDef _string_module = {
15609 PyModuleDef_HEAD_INIT,
15610 "_string",
15611 PyDoc_STR("string helper module"),
15612 0,
15613 _string_methods,
15614 NULL,
15615 NULL,
15616 NULL,
15617 NULL
15618};
15619
15620PyMODINIT_FUNC
15621PyInit__string(void)
15622{
15623 return PyModule_Create(&_string_module);
15624}
15625
15626
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015627#ifdef __cplusplus
15628}
15629#endif