blob: b3a851a9f8142ba628703a5d43f4f25bff52054f [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010043#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010044#include "pycore_object.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010045#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050047#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070048#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
Larry Hastings61272b72014-01-07 12:41:53 -080054/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090055class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080056[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090057/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
58
59/*[python input]
60class Py_UCS4_converter(CConverter):
61 type = 'Py_UCS4'
62 converter = 'convert_uc'
63
64 def converter_init(self):
65 if self.default is not unspecified:
66 self.c_default = ascii(self.default)
67 if len(self.c_default) > 4 or self.c_default[0] != "'":
68 self.c_default = hex(ord(self.default))
69
70[python start generated code]*/
71/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080072
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000073/* --- Globals ------------------------------------------------------------
74
Serhiy Storchaka05997252013-01-26 12:14:02 +020075NOTE: In the interpreter's initialization phase, some globals are currently
76 initialized dynamically as needed. In the process Unicode objects may
77 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000078
79*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000080
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000081
82#ifdef __cplusplus
83extern "C" {
84#endif
85
Victor Stinner8faf8212011-12-08 22:14:11 +010086/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
87#define MAX_UNICODE 0x10ffff
88
Victor Stinner910337b2011-10-03 03:20:16 +020089#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020090# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020091#else
92# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
93#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020094
Victor Stinnere90fe6a2011-10-01 16:48:13 +020095#define _PyUnicode_UTF8(op) \
96 (((PyCompactUnicodeObject*)(op))->utf8)
97#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020098 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020099 assert(PyUnicode_IS_READY(op)), \
100 PyUnicode_IS_COMPACT_ASCII(op) ? \
101 ((char*)((PyASCIIObject*)(op) + 1)) : \
102 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +0200103#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200104 (((PyCompactUnicodeObject*)(op))->utf8_length)
105#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200106 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200107 assert(PyUnicode_IS_READY(op)), \
108 PyUnicode_IS_COMPACT_ASCII(op) ? \
109 ((PyASCIIObject*)(op))->length : \
110 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +0200111#define _PyUnicode_WSTR(op) \
112 (((PyASCIIObject*)(op))->wstr)
113#define _PyUnicode_WSTR_LENGTH(op) \
114 (((PyCompactUnicodeObject*)(op))->wstr_length)
115#define _PyUnicode_LENGTH(op) \
116 (((PyASCIIObject *)(op))->length)
117#define _PyUnicode_STATE(op) \
118 (((PyASCIIObject *)(op))->state)
119#define _PyUnicode_HASH(op) \
120 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200121#define _PyUnicode_KIND(op) \
122 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200123 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200124#define _PyUnicode_GET_LENGTH(op) \
125 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200127#define _PyUnicode_DATA_ANY(op) \
128 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200129
Victor Stinner910337b2011-10-03 03:20:16 +0200130#undef PyUnicode_READY
131#define PyUnicode_READY(op) \
132 (assert(_PyUnicode_CHECK(op)), \
133 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200134 0 : \
Victor Stinner7931d9a2011-11-04 00:22:48 +0100135 _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200136
Victor Stinnerc379ead2011-10-03 12:52:27 +0200137#define _PyUnicode_SHARE_UTF8(op) \
138 (assert(_PyUnicode_CHECK(op)), \
139 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
140 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the Unicode object is the very same pointer
   as its canonical data pointer, i.e. the two representations share one
   memory block.  Only the macro parameter `op` is referenced, so the macro
   works regardless of how the caller's variable is named. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
144
Victor Stinner829c0ad2011-10-03 01:08:02 +0200145/* true if the Unicode object has an allocated UTF-8 memory block
146 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200147#define _PyUnicode_HAS_UTF8_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200148 ((!PyUnicode_IS_COMPACT_ASCII(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200149 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200150 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
151
Victor Stinner03490912011-10-03 23:45:12 +0200152/* true if the Unicode object has an allocated wstr memory block
153 (not shared with other data) */
154#define _PyUnicode_HAS_WSTR_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200155 ((_PyUnicode_WSTR(op) && \
Victor Stinner03490912011-10-03 23:45:12 +0200156 (!PyUnicode_IS_READY(op) || \
157 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
158
Victor Stinner910337b2011-10-03 03:20:16 +0200159/* Generic helper macro to convert characters of different types.
160 from_type and to_type have to be valid type names, begin and end
161 are pointers to the source characters which should be of type
162 "from_type *". to is a pointer of type "to_type *" and points to the
163 buffer where the result characters are written to. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                        \
        to_type *_to = (to_type *)(to);                         \
        const from_type *_iter = (from_type *)(begin);          \
        const from_type *_end = (from_type *)(end);             \
        Py_ssize_t n = (_end) - (_iter);                        \
        const from_type *_unrolled_end =                        \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                  \
        /* bulk part: four characters per iteration */          \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0];                        \
            _to[1] = (to_type) _iter[1];                        \
            _to[2] = (to_type) _iter[2];                        \
            _to[3] = (to_type) _iter[3];                        \
        }                                                       \
        /* tail: whatever is left after the unrolled part */    \
        for (; _iter < (_end); ++_iter, ++_to) {                \
            *_to = (to_type) *_iter;                            \
        }                                                       \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200182
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200183#ifdef MS_WINDOWS
184 /* On Windows, overallocate by 50% is the best factor */
185# define OVERALLOCATE_FACTOR 2
186#else
187 /* On Linux, overallocate by 25% is the best factor */
188# define OVERALLOCATE_FACTOR 4
189#endif
190
Walter Dörwald16807132007-05-25 13:52:07 +0000191/* This dictionary holds all interned unicode strings. Note that references
192 to strings in this dictionary are *not* counted in the string's ob_refcnt.
193 When the interned string reaches a refcnt of 0 the string deallocation
194 function will delete the reference from this dictionary.
195
   Another way to look at this is to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000197 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000198*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200199static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200203
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) the first time it is needed.  If that creation
   returns NULL, unicode_empty stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        if (unicode_empty != NULL) {                                    \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return the shared empty string from the enclosing function.  The returned
   value is unicode_empty itself, which is NULL if it could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000222
Victor Stinner59423e32018-11-26 13:40:01 +0100223static inline void
224unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
225 Py_ssize_t start, Py_ssize_t length)
226{
227 assert(0 <= start);
228 assert(kind != PyUnicode_WCHAR_KIND);
229 switch (kind) {
230 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100231 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100232 Py_UCS1 ch = (unsigned char)value;
233 Py_UCS1 *to = (Py_UCS1 *)data + start;
234 memset(to, ch, length);
235 break;
236 }
237 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100238 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100239 Py_UCS2 ch = (Py_UCS2)value;
240 Py_UCS2 *to = (Py_UCS2 *)data + start;
241 const Py_UCS2 *end = to + length;
242 for (; to < end; ++to) *to = ch;
243 break;
244 }
245 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100246 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100247 Py_UCS4 ch = value;
248 Py_UCS4 * to = (Py_UCS4 *)data + start;
249 const Py_UCS4 *end = to + length;
250 for (; to < end; ++to) *to = ch;
251 break;
252 }
253 default: Py_UNREACHABLE();
254 }
255}
256
257
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200258/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700259static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200260_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
261
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200262/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200263static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200264
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265/* Single character Unicode strings in the Latin-1 range are being
266 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200267static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000268
Christian Heimes190d79e2008-01-30 11:58:22 +0000269/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by an ASCII code (0..127): 1 for the whitespace
   characters listed below, 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
    [0x7F] = 0      /* last entry: keeps the table at 128 elements */
};
299
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200300/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200301static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200302static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100303static int unicode_modifiable(PyObject *unicode);
304
Victor Stinnerfe226c02011-10-03 03:52:20 +0200305
Alexander Belopolsky40018472011-02-26 01:02:56 +0000306static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100307_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200308static PyObject *
309_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
310static PyObject *
311_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
312
313static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000314unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000315 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100316 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000317 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
318
Alexander Belopolsky40018472011-02-26 01:02:56 +0000319static void
320raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300321 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100322 PyObject *unicode,
323 Py_ssize_t startpos, Py_ssize_t endpos,
324 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000325
Christian Heimes190d79e2008-01-30 11:58:22 +0000326/* Same for linebreaks */
/* Lookup table indexed by an ASCII code (0..127): 1 for the line break
   characters listed below, 0 for everything else. */
static const unsigned char ascii_linebreak[] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x7F] = 0      /* last entry: keeps the table at 128 elements */
};
353
INADA Naoki3ae20562017-01-16 20:41:20 +0900354static int convert_uc(PyObject *obj, void *addr);
355
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300356#include "clinic/unicodeobject.c.h"
357
Victor Stinner3d4226a2018-08-29 22:21:32 +0200358_Py_error_handler
359_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200360{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200361 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200362 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200363 }
364 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200365 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200366 }
367 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200368 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200369 }
370 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200371 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200372 }
373 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200374 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200375 }
376 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200377 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200378 }
379 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200380 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200381 }
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_OTHER;
383}
384
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300385/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
386 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000387Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000388PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000389{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000390#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000391 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000392#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000393 /* This is actually an illegal character, so it should
394 not be passed to unichr. */
395 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000396#endif
397}
398
#ifdef Py_DEBUG
/* Debug-build check of the internal invariants of a Unicode object.

   op:            the object to verify; it must be a Unicode object.
   check_content: if non-zero (and the string is not in the wchar kind),
                  also scan the characters to verify that the narrowest
                  possible kind is used and that the buffer is
                  null-terminated.

   Each violated invariant is reported through _PyObject_ASSERT(); when
   every check passes the function returns 1. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
#define ASSERT(expr) _PyObject_ASSERT(op, (expr))

    PyASCIIObject *ascii;
    unsigned int kind;

    ASSERT(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        ASSERT(kind == PyUnicode_1BYTE_KIND);
        ASSERT(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact != 1) {
            /* non-compact object: the data pointer is stored in the object */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                ASSERT(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                ASSERT(ascii->state.compact == 0);
                ASSERT(ascii->state.ready == 1);
                ASSERT(data != NULL);
                if (!ascii->state.ascii) {
                    ASSERT (compact->utf8 != data);
                }
                else {
                    ASSERT (compact->utf8 == data);
                    ASSERT (compact->utf8_length == ascii->length);
                }
            }
            else {
                /* not ready yet: only the wstr representation exists */
                ASSERT(ascii->length == 0);
                ASSERT(ascii->hash == -1);
                ASSERT(ascii->state.compact == 0);
                ASSERT(ascii->state.ascii == 0);
                ASSERT(ascii->state.ready == 0);
                ASSERT(ascii->state.interned == SSTATE_NOT_INTERNED);
                ASSERT(ascii->wstr != NULL);
                ASSERT(data == NULL);
                ASSERT(compact->utf8 == NULL);
            }
        }
        else {
            /* compact non-ASCII object: the characters follow the header */
            data = compact + 1;
            ASSERT(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            ASSERT(ascii->state.ascii == 0);
            ASSERT(ascii->state.ready == 1);
            ASSERT (compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data only for the kind whose character
               width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind != wchar_kind) {
                ASSERT(ascii->wstr != data);
            }
            else {
                ASSERT(ascii->wstr == data);
                ASSERT(compact->wstr_length == ascii->length);
            }
        }

        if (compact->utf8 == NULL) {
            ASSERT(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            ASSERT(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i = 0; i < ascii->length; i++) {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar) {
                maxchar = ch;
            }
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                ASSERT(maxchar >= 128);
                ASSERT(maxchar <= 255);
            }
            else {
                ASSERT(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            ASSERT(maxchar >= 0x100);
            ASSERT(maxchar <= 0xFFFF);
            break;
        default:
            ASSERT(maxchar >= 0x10000);
            ASSERT(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character just past the end must be the null terminator */
        ASSERT(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef ASSERT
}
#endif
518
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100519static PyObject*
520unicode_result_wchar(PyObject *unicode)
521{
522#ifndef Py_DEBUG
523 Py_ssize_t len;
524
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100525 len = _PyUnicode_WSTR_LENGTH(unicode);
526 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100527 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200528 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100529 }
530
531 if (len == 1) {
532 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100533 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100534 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
535 Py_DECREF(unicode);
536 return latin1_char;
537 }
538 }
539
540 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200541 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100542 return NULL;
543 }
544#else
Victor Stinneraa771272012-10-04 02:32:58 +0200545 assert(Py_REFCNT(unicode) == 1);
546
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100547 /* don't make the result ready in debug mode to ensure that the caller
548 makes the string ready before using it */
549 assert(_PyUnicode_CheckConsistency(unicode, 1));
550#endif
551 return unicode;
552}
553
554static PyObject*
555unicode_result_ready(PyObject *unicode)
556{
557 Py_ssize_t length;
558
559 length = PyUnicode_GET_LENGTH(unicode);
560 if (length == 0) {
561 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100562 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200563 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100564 }
565 return unicode_empty;
566 }
567
568 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200569 void *data = PyUnicode_DATA(unicode);
570 int kind = PyUnicode_KIND(unicode);
571 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100572 if (ch < 256) {
573 PyObject *latin1_char = unicode_latin1[ch];
574 if (latin1_char != NULL) {
575 if (unicode != latin1_char) {
576 Py_INCREF(latin1_char);
577 Py_DECREF(unicode);
578 }
579 return latin1_char;
580 }
581 else {
582 assert(_PyUnicode_CheckConsistency(unicode, 1));
583 Py_INCREF(unicode);
584 unicode_latin1[ch] = unicode;
585 return unicode;
586 }
587 }
588 }
589
590 assert(_PyUnicode_CheckConsistency(unicode, 1));
591 return unicode;
592}
593
594static PyObject*
595unicode_result(PyObject *unicode)
596{
597 assert(_PyUnicode_CHECK(unicode));
598 if (PyUnicode_IS_READY(unicode))
599 return unicode_result_ready(unicode);
600 else
601 return unicode_result_wchar(unicode);
602}
603
Victor Stinnerc4b49542011-12-11 22:44:26 +0100604static PyObject*
605unicode_result_unchanged(PyObject *unicode)
606{
607 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500608 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100609 return NULL;
610 Py_INCREF(unicode);
611 return unicode;
612 }
613 else
614 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100615 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100616}
617
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200618/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
619 ASCII, Latin1, UTF-8, etc. */
620static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200621backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200622 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
623{
Victor Stinnerad771582015-10-09 12:38:53 +0200624 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 Py_UCS4 ch;
626 enum PyUnicode_Kind kind;
627 void *data;
628
629 assert(PyUnicode_IS_READY(unicode));
630 kind = PyUnicode_KIND(unicode);
631 data = PyUnicode_DATA(unicode);
632
633 size = 0;
634 /* determine replacement size */
635 for (i = collstart; i < collend; ++i) {
636 Py_ssize_t incr;
637
638 ch = PyUnicode_READ(kind, data, i);
639 if (ch < 0x100)
640 incr = 2+2;
641 else if (ch < 0x10000)
642 incr = 2+4;
643 else {
644 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200645 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 }
647 if (size > PY_SSIZE_T_MAX - incr) {
648 PyErr_SetString(PyExc_OverflowError,
649 "encoded result is too long for a Python string");
650 return NULL;
651 }
652 size += incr;
653 }
654
Victor Stinnerad771582015-10-09 12:38:53 +0200655 str = _PyBytesWriter_Prepare(writer, str, size);
656 if (str == NULL)
657 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200658
659 /* generate replacement */
660 for (i = collstart; i < collend; ++i) {
661 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200662 *str++ = '\\';
663 if (ch >= 0x00010000) {
664 *str++ = 'U';
665 *str++ = Py_hexdigits[(ch>>28)&0xf];
666 *str++ = Py_hexdigits[(ch>>24)&0xf];
667 *str++ = Py_hexdigits[(ch>>20)&0xf];
668 *str++ = Py_hexdigits[(ch>>16)&0xf];
669 *str++ = Py_hexdigits[(ch>>12)&0xf];
670 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200671 }
Victor Stinner797485e2015-10-09 03:17:30 +0200672 else if (ch >= 0x100) {
673 *str++ = 'u';
674 *str++ = Py_hexdigits[(ch>>12)&0xf];
675 *str++ = Py_hexdigits[(ch>>8)&0xf];
676 }
677 else
678 *str++ = 'x';
679 *str++ = Py_hexdigits[(ch>>4)&0xf];
680 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200681 }
682 return str;
683}
684
685/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
686 ASCII, Latin1, UTF-8, etc. */
687static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200688xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200689 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
690{
Victor Stinnerad771582015-10-09 12:38:53 +0200691 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200692 Py_UCS4 ch;
693 enum PyUnicode_Kind kind;
694 void *data;
695
696 assert(PyUnicode_IS_READY(unicode));
697 kind = PyUnicode_KIND(unicode);
698 data = PyUnicode_DATA(unicode);
699
700 size = 0;
701 /* determine replacement size */
702 for (i = collstart; i < collend; ++i) {
703 Py_ssize_t incr;
704
705 ch = PyUnicode_READ(kind, data, i);
706 if (ch < 10)
707 incr = 2+1+1;
708 else if (ch < 100)
709 incr = 2+2+1;
710 else if (ch < 1000)
711 incr = 2+3+1;
712 else if (ch < 10000)
713 incr = 2+4+1;
714 else if (ch < 100000)
715 incr = 2+5+1;
716 else if (ch < 1000000)
717 incr = 2+6+1;
718 else {
719 assert(ch <= MAX_UNICODE);
720 incr = 2+7+1;
721 }
722 if (size > PY_SSIZE_T_MAX - incr) {
723 PyErr_SetString(PyExc_OverflowError,
724 "encoded result is too long for a Python string");
725 return NULL;
726 }
727 size += incr;
728 }
729
Victor Stinnerad771582015-10-09 12:38:53 +0200730 str = _PyBytesWriter_Prepare(writer, str, size);
731 if (str == NULL)
732 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200733
734 /* generate replacement */
735 for (i = collstart; i < collend; ++i) {
736 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
737 }
738 return str;
739}
740
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741/* --- Bloom Filters ----------------------------------------------------- */
742
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the lowest bits
   of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
746
747/* the linebreak mask is set up by Unicode_Init below */
748
Antoine Pitrouf068f942010-01-13 14:19:12 +0000749#if LONG_BIT >= 128
750#define BLOOM_WIDTH 128
751#elif LONG_BIT >= 64
752#define BLOOM_WIDTH 64
753#elif LONG_BIT >= 32
754#define BLOOM_WIDTH 32
755#else
756#error "LONG_BIT is smaller than 32"
757#endif
758
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759#define BLOOM_MASK unsigned long
760
Serhiy Storchaka05997252013-01-26 12:14:02 +0200761static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000762
Antoine Pitrouf068f942010-01-13 14:19:12 +0000763#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000764
Benjamin Peterson29060642009-01-31 22:14:21 +0000765#define BLOOM_LINEBREAK(ch) \
766 ((ch) < 128U ? ascii_linebreak[(ch)] : \
767 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000768
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700769static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200770make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000771{
Victor Stinnera85af502013-04-09 21:53:54 +0200772#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
773 do { \
774 TYPE *data = (TYPE *)PTR; \
775 TYPE *end = data + LEN; \
776 Py_UCS4 ch; \
777 for (; data != end; data++) { \
778 ch = *data; \
779 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
780 } \
781 break; \
782 } while (0)
783
Thomas Wouters477c8d52006-05-27 19:21:47 +0000784 /* calculate simple bloom-style bitmask for a given unicode string */
785
Antoine Pitrouf068f942010-01-13 14:19:12 +0000786 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000787
788 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200789 switch (kind) {
790 case PyUnicode_1BYTE_KIND:
791 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
792 break;
793 case PyUnicode_2BYTE_KIND:
794 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
795 break;
796 case PyUnicode_4BYTE_KIND:
797 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
798 break;
799 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700800 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200801 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000802 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200803
804#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000805}
806
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300807static int
808ensure_unicode(PyObject *obj)
809{
810 if (!PyUnicode_Check(obj)) {
811 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200812 "must be str, not %.100s",
813 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300814 return -1;
815 }
816 return PyUnicode_READY(obj);
817}
818
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200819/* Compilation of templated routines */
820
821#include "stringlib/asciilib.h"
822#include "stringlib/fastsearch.h"
823#include "stringlib/partition.h"
824#include "stringlib/split.h"
825#include "stringlib/count.h"
826#include "stringlib/find.h"
827#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200828#include "stringlib/undef.h"
829
830#include "stringlib/ucs1lib.h"
831#include "stringlib/fastsearch.h"
832#include "stringlib/partition.h"
833#include "stringlib/split.h"
834#include "stringlib/count.h"
835#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300836#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200837#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200838#include "stringlib/undef.h"
839
840#include "stringlib/ucs2lib.h"
841#include "stringlib/fastsearch.h"
842#include "stringlib/partition.h"
843#include "stringlib/split.h"
844#include "stringlib/count.h"
845#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300846#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200847#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200848#include "stringlib/undef.h"
849
850#include "stringlib/ucs4lib.h"
851#include "stringlib/fastsearch.h"
852#include "stringlib/partition.h"
853#include "stringlib/split.h"
854#include "stringlib/count.h"
855#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300856#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200857#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200858#include "stringlib/undef.h"
859
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200860#include "stringlib/unicodedefs.h"
861#include "stringlib/fastsearch.h"
862#include "stringlib/count.h"
863#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100864#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200865
Guido van Rossumd57fd912000-03-10 22:53:23 +0000866/* --- Unicode Object ----------------------------------------------------- */
867
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700868static inline Py_ssize_t
869findchar(const void *s, int kind,
870 Py_ssize_t size, Py_UCS4 ch,
871 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200872{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200873 switch (kind) {
874 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200875 if ((Py_UCS1) ch != ch)
876 return -1;
877 if (direction > 0)
878 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
879 else
880 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200881 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200882 if ((Py_UCS2) ch != ch)
883 return -1;
884 if (direction > 0)
885 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
886 else
887 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200888 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200889 if (direction > 0)
890 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
891 else
892 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200893 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700894 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200896}
897
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index `old_length` up to the current length are
   overwritten (every byte is set to 0xff); nothing is done when the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);

    if (length > old_length) {
        /* `kind` is used as the number of bytes per character */
        Py_UCS1 *first_new = data + old_length * kind;
        memset(first_new, 0xff, (length - old_length) * kind);
    }
}
#endif
916
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917static PyObject*
918resize_compact(PyObject *unicode, Py_ssize_t length)
919{
920 Py_ssize_t char_size;
921 Py_ssize_t struct_size;
922 Py_ssize_t new_size;
923 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100924 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200925#ifdef Py_DEBUG
926 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
927#endif
928
Victor Stinner79891572012-05-03 13:43:07 +0200929 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200930 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100931 assert(PyUnicode_IS_COMPACT(unicode));
932
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200933 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100934 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935 struct_size = sizeof(PyASCIIObject);
936 else
937 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200938 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
941 PyErr_NoMemory();
942 return NULL;
943 }
944 new_size = (struct_size + (length + 1) * char_size);
945
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200946 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
947 PyObject_DEL(_PyUnicode_UTF8(unicode));
948 _PyUnicode_UTF8(unicode) = NULL;
949 _PyUnicode_UTF8_LENGTH(unicode) = 0;
950 }
Victor Stinner84def372011-12-11 20:04:56 +0100951 _Py_DEC_REFTOTAL;
952 _Py_ForgetReference(unicode);
953
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300954 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100955 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100956 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 PyErr_NoMemory();
958 return NULL;
959 }
Victor Stinner84def372011-12-11 20:04:56 +0100960 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100962
Victor Stinnerfe226c02011-10-03 03:52:20 +0200963 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200964 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200965 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100966 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200967 _PyUnicode_WSTR_LENGTH(unicode) = length;
968 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100969 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
970 PyObject_DEL(_PyUnicode_WSTR(unicode));
971 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100972 if (!PyUnicode_IS_ASCII(unicode))
973 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100974 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200975#ifdef Py_DEBUG
976 unicode_fill_invalid(unicode, old_length);
977#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200978 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
979 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200980 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200981 return unicode;
982}
983
Alexander Belopolsky40018472011-02-26 01:02:56 +0000984static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200985resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000986{
Victor Stinner95663112011-10-04 01:03:50 +0200987 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100988 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200989 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200990 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000991
Victor Stinnerfe226c02011-10-03 03:52:20 +0200992 if (PyUnicode_IS_READY(unicode)) {
993 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200994 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200995 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200996#ifdef Py_DEBUG
997 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
998#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999
1000 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001001 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1003 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001004
1005 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1006 PyErr_NoMemory();
1007 return -1;
1008 }
1009 new_size = (length + 1) * char_size;
1010
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1012 {
1013 PyObject_DEL(_PyUnicode_UTF8(unicode));
1014 _PyUnicode_UTF8(unicode) = NULL;
1015 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1016 }
1017
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 data = (PyObject *)PyObject_REALLOC(data, new_size);
1019 if (data == NULL) {
1020 PyErr_NoMemory();
1021 return -1;
1022 }
1023 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001024 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001026 _PyUnicode_WSTR_LENGTH(unicode) = length;
1027 }
1028 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001029 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001030 _PyUnicode_UTF8_LENGTH(unicode) = length;
1031 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001032 _PyUnicode_LENGTH(unicode) = length;
1033 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001034#ifdef Py_DEBUG
1035 unicode_fill_invalid(unicode, old_length);
1036#endif
Victor Stinner95663112011-10-04 01:03:50 +02001037 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001038 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001039 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 }
Victor Stinner95663112011-10-04 01:03:50 +02001042 assert(_PyUnicode_WSTR(unicode) != NULL);
1043
1044 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001045 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001046 PyErr_NoMemory();
1047 return -1;
1048 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001049 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001050 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001051 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001052 if (!wstr) {
1053 PyErr_NoMemory();
1054 return -1;
1055 }
1056 _PyUnicode_WSTR(unicode) = wstr;
1057 _PyUnicode_WSTR(unicode)[length] = 0;
1058 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001059 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 return 0;
1061}
1062
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063static PyObject*
1064resize_copy(PyObject *unicode, Py_ssize_t length)
1065{
1066 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001067 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001068 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001069
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001070 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001071
1072 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1073 if (copy == NULL)
1074 return NULL;
1075
1076 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001077 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001079 }
1080 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001081 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001082
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001083 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001084 if (w == NULL)
1085 return NULL;
1086 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1087 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001088 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001089 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001090 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001091 }
1092}
1093
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001095 Ux0000 terminated; some code (e.g. new_identifier)
1096 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097
1098 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001099 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100
1101*/
1102
Alexander Belopolsky40018472011-02-26 01:02:56 +00001103static PyUnicodeObject *
1104_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001106 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108
Thomas Wouters477c8d52006-05-27 19:21:47 +00001109 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 if (length == 0 && unicode_empty != NULL) {
1111 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001112 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 }
1114
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001115 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001116 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001117 return (PyUnicodeObject *)PyErr_NoMemory();
1118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 if (length < 0) {
1120 PyErr_SetString(PyExc_SystemError,
1121 "Negative size passed to _PyUnicode_New");
1122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 }
1124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1126 if (unicode == NULL)
1127 return NULL;
1128 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001129
1130 _PyUnicode_WSTR_LENGTH(unicode) = length;
1131 _PyUnicode_HASH(unicode) = -1;
1132 _PyUnicode_STATE(unicode).interned = 0;
1133 _PyUnicode_STATE(unicode).kind = 0;
1134 _PyUnicode_STATE(unicode).compact = 0;
1135 _PyUnicode_STATE(unicode).ready = 0;
1136 _PyUnicode_STATE(unicode).ascii = 0;
1137 _PyUnicode_DATA_ANY(unicode) = NULL;
1138 _PyUnicode_LENGTH(unicode) = 0;
1139 _PyUnicode_UTF8(unicode) = NULL;
1140 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1143 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001144 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001145 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001146 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001147 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148
Jeremy Hyltond8082792003-09-16 19:41:39 +00001149 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001150 * the caller fails before initializing str -- unicode_resize()
1151 * reads str[0], and the Keep-Alive optimization can keep memory
1152 * allocated for str alive across a call to unicode_dealloc(unicode).
1153 * We don't want unicode_resize to read uninitialized memory in
1154 * that case.
1155 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156 _PyUnicode_WSTR(unicode)[0] = 0;
1157 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001158
Victor Stinner7931d9a2011-11-04 00:22:48 +01001159 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160 return unicode;
1161}
1162
Victor Stinnerf42dc442011-10-02 23:33:16 +02001163static const char*
1164unicode_kind_name(PyObject *unicode)
1165{
Victor Stinner42dfd712011-10-03 14:41:45 +02001166 /* don't check consistency: unicode_kind_name() is called from
1167 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001168 if (!PyUnicode_IS_COMPACT(unicode))
1169 {
1170 if (!PyUnicode_IS_READY(unicode))
1171 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001172 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001173 {
1174 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001175 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001176 return "legacy ascii";
1177 else
1178 return "legacy latin1";
1179 case PyUnicode_2BYTE_KIND:
1180 return "legacy UCS2";
1181 case PyUnicode_4BYTE_KIND:
1182 return "legacy UCS4";
1183 default:
1184 return "<legacy invalid kind>";
1185 }
1186 }
1187 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001188 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001189 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001190 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001191 return "ascii";
1192 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001193 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001194 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001195 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001196 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001197 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001198 default:
1199 return "<invalid compact kind>";
1200 }
1201}
1202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001203#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1209
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
Victor Stinnera42de742018-11-22 10:25:22 +01001214void *_PyUnicode_data(void *unicode_raw) {
1215 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216 printf("obj %p\n", unicode);
1217 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1218 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1219 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1220 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1221 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1222 return PyUnicode_DATA(unicode);
1223}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001224
1225void
1226_PyUnicode_Dump(PyObject *op)
1227{
1228 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001229 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1230 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1231 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001232
Victor Stinnera849a4b2011-10-03 12:12:11 +02001233 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001234 {
1235 if (ascii->state.ascii)
1236 data = (ascii + 1);
1237 else
1238 data = (compact + 1);
1239 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001240 else
1241 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001242 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1243 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001244
Victor Stinnera849a4b2011-10-03 12:12:11 +02001245 if (ascii->wstr == data)
1246 printf("shared ");
1247 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001248
Victor Stinnera3b334d2011-10-03 13:53:37 +02001249 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001250 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001251 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1252 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001253 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1254 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001255 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001256 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001257}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258#endif
1259
1260PyObject *
1261PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1262{
1263 PyObject *obj;
1264 PyCompactUnicodeObject *unicode;
1265 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001266 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001267 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268 Py_ssize_t char_size;
1269 Py_ssize_t struct_size;
1270
1271 /* Optimization for empty strings */
1272 if (size == 0 && unicode_empty != NULL) {
1273 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001274 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001275 }
1276
Victor Stinner9e9d6892011-10-04 01:02:02 +02001277 is_ascii = 0;
1278 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001279 struct_size = sizeof(PyCompactUnicodeObject);
1280 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001281 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001282 char_size = 1;
1283 is_ascii = 1;
1284 struct_size = sizeof(PyASCIIObject);
1285 }
1286 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001287 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001288 char_size = 1;
1289 }
1290 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001291 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001292 char_size = 2;
1293 if (sizeof(wchar_t) == 2)
1294 is_sharing = 1;
1295 }
1296 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001297 if (maxchar > MAX_UNICODE) {
1298 PyErr_SetString(PyExc_SystemError,
1299 "invalid maximum character passed to PyUnicode_New");
1300 return NULL;
1301 }
Victor Stinner8f825062012-04-27 13:55:39 +02001302 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001303 char_size = 4;
1304 if (sizeof(wchar_t) == 4)
1305 is_sharing = 1;
1306 }
1307
1308 /* Ensure we won't overflow the size. */
1309 if (size < 0) {
1310 PyErr_SetString(PyExc_SystemError,
1311 "Negative size passed to PyUnicode_New");
1312 return NULL;
1313 }
1314 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1315 return PyErr_NoMemory();
1316
1317 /* Duplicated allocation code from _PyObject_New() instead of a call to
1318 * PyObject_New() so we are able to allocate space for the object and
1319 * it's data buffer.
1320 */
1321 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1322 if (obj == NULL)
1323 return PyErr_NoMemory();
1324 obj = PyObject_INIT(obj, &PyUnicode_Type);
1325 if (obj == NULL)
1326 return NULL;
1327
1328 unicode = (PyCompactUnicodeObject *)obj;
1329 if (is_ascii)
1330 data = ((PyASCIIObject*)obj) + 1;
1331 else
1332 data = unicode + 1;
1333 _PyUnicode_LENGTH(unicode) = size;
1334 _PyUnicode_HASH(unicode) = -1;
1335 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001336 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 _PyUnicode_STATE(unicode).compact = 1;
1338 _PyUnicode_STATE(unicode).ready = 1;
1339 _PyUnicode_STATE(unicode).ascii = is_ascii;
1340 if (is_ascii) {
1341 ((char*)data)[size] = 0;
1342 _PyUnicode_WSTR(unicode) = NULL;
1343 }
Victor Stinner8f825062012-04-27 13:55:39 +02001344 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345 ((char*)data)[size] = 0;
1346 _PyUnicode_WSTR(unicode) = NULL;
1347 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001349 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351 else {
1352 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001353 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001354 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001355 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001356 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357 ((Py_UCS4*)data)[size] = 0;
1358 if (is_sharing) {
1359 _PyUnicode_WSTR_LENGTH(unicode) = size;
1360 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1361 }
1362 else {
1363 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1364 _PyUnicode_WSTR(unicode) = NULL;
1365 }
1366 }
Victor Stinner8f825062012-04-27 13:55:39 +02001367#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001368 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001369#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001370 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371 return obj;
1372}
1373
1374#if SIZEOF_WCHAR_T == 2
1375/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1376 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001377 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001378
1379 This function assumes that unicode can hold one more code point than wstr
1380 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001381static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001383 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384{
1385 const wchar_t *iter;
1386 Py_UCS4 *ucs4_out;
1387
Victor Stinner910337b2011-10-03 03:20:16 +02001388 assert(unicode != NULL);
1389 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1391 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1392
1393 for (iter = begin; iter < end; ) {
1394 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1395 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001396 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1397 && (iter+1) < end
1398 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 {
Victor Stinner551ac952011-11-29 22:58:13 +01001400 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401 iter += 2;
1402 }
1403 else {
1404 *ucs4_out++ = *iter;
1405 iter++;
1406 }
1407 }
1408 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1409 _PyUnicode_GET_LENGTH(unicode)));
1410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411}
1412#endif
1413
Victor Stinnercd9950f2011-10-02 00:34:53 +02001414static int
Victor Stinner488fa492011-12-12 00:01:39 +01001415unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001416{
Victor Stinner488fa492011-12-12 00:01:39 +01001417 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001418 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001419 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001420 return -1;
1421 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001422 return 0;
1423}
1424
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001425static int
1426_copy_characters(PyObject *to, Py_ssize_t to_start,
1427 PyObject *from, Py_ssize_t from_start,
1428 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001430 unsigned int from_kind, to_kind;
1431 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001432
Victor Stinneree4544c2012-05-09 22:24:08 +02001433 assert(0 <= how_many);
1434 assert(0 <= from_start);
1435 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001436 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001437 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001438 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439
Victor Stinnerd3f08822012-05-29 12:57:52 +02001440 assert(PyUnicode_Check(to));
1441 assert(PyUnicode_IS_READY(to));
1442 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1443
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001444 if (how_many == 0)
1445 return 0;
1446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001448 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001450 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451
Victor Stinnerf1852262012-06-16 16:38:26 +02001452#ifdef Py_DEBUG
1453 if (!check_maxchar
1454 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1455 {
1456 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1457 Py_UCS4 ch;
1458 Py_ssize_t i;
1459 for (i=0; i < how_many; i++) {
1460 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1461 assert(ch <= to_maxchar);
1462 }
1463 }
1464#endif
1465
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001466 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001467 if (check_maxchar
1468 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1469 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001470 /* Writing Latin-1 characters into an ASCII string requires to
1471 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001472 Py_UCS4 max_char;
1473 max_char = ucs1lib_find_max_char(from_data,
1474 (Py_UCS1*)from_data + how_many);
1475 if (max_char >= 128)
1476 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001477 }
Christian Heimesf051e432016-09-13 20:22:02 +02001478 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001479 (char*)from_data + from_kind * from_start,
1480 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001482 else if (from_kind == PyUnicode_1BYTE_KIND
1483 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001484 {
1485 _PyUnicode_CONVERT_BYTES(
1486 Py_UCS1, Py_UCS2,
1487 PyUnicode_1BYTE_DATA(from) + from_start,
1488 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1489 PyUnicode_2BYTE_DATA(to) + to_start
1490 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001491 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001492 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001493 && to_kind == PyUnicode_4BYTE_KIND)
1494 {
1495 _PyUnicode_CONVERT_BYTES(
1496 Py_UCS1, Py_UCS4,
1497 PyUnicode_1BYTE_DATA(from) + from_start,
1498 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1499 PyUnicode_4BYTE_DATA(to) + to_start
1500 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001501 }
1502 else if (from_kind == PyUnicode_2BYTE_KIND
1503 && to_kind == PyUnicode_4BYTE_KIND)
1504 {
1505 _PyUnicode_CONVERT_BYTES(
1506 Py_UCS2, Py_UCS4,
1507 PyUnicode_2BYTE_DATA(from) + from_start,
1508 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1509 PyUnicode_4BYTE_DATA(to) + to_start
1510 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001511 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001513 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1514
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001515 if (!check_maxchar) {
1516 if (from_kind == PyUnicode_2BYTE_KIND
1517 && to_kind == PyUnicode_1BYTE_KIND)
1518 {
1519 _PyUnicode_CONVERT_BYTES(
1520 Py_UCS2, Py_UCS1,
1521 PyUnicode_2BYTE_DATA(from) + from_start,
1522 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1523 PyUnicode_1BYTE_DATA(to) + to_start
1524 );
1525 }
1526 else if (from_kind == PyUnicode_4BYTE_KIND
1527 && to_kind == PyUnicode_1BYTE_KIND)
1528 {
1529 _PyUnicode_CONVERT_BYTES(
1530 Py_UCS4, Py_UCS1,
1531 PyUnicode_4BYTE_DATA(from) + from_start,
1532 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1533 PyUnicode_1BYTE_DATA(to) + to_start
1534 );
1535 }
1536 else if (from_kind == PyUnicode_4BYTE_KIND
1537 && to_kind == PyUnicode_2BYTE_KIND)
1538 {
1539 _PyUnicode_CONVERT_BYTES(
1540 Py_UCS4, Py_UCS2,
1541 PyUnicode_4BYTE_DATA(from) + from_start,
1542 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1543 PyUnicode_2BYTE_DATA(to) + to_start
1544 );
1545 }
1546 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001547 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001548 }
1549 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001550 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001551 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001552 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001553 Py_ssize_t i;
1554
Victor Stinnera0702ab2011-09-29 14:14:38 +02001555 for (i=0; i < how_many; i++) {
1556 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001557 if (ch > to_maxchar)
1558 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001559 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1560 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001561 }
1562 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 return 0;
1564}
1565
Victor Stinnerd3f08822012-05-29 12:57:52 +02001566void
1567_PyUnicode_FastCopyCharacters(
1568 PyObject *to, Py_ssize_t to_start,
1569 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001570{
1571 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1572}
1573
1574Py_ssize_t
1575PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1576 PyObject *from, Py_ssize_t from_start,
1577 Py_ssize_t how_many)
1578{
1579 int err;
1580
1581 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1582 PyErr_BadInternalCall();
1583 return -1;
1584 }
1585
Benjamin Petersonbac79492012-01-14 13:34:47 -05001586 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001587 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001588 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001589 return -1;
1590
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001591 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001592 PyErr_SetString(PyExc_IndexError, "string index out of range");
1593 return -1;
1594 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001595 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001596 PyErr_SetString(PyExc_IndexError, "string index out of range");
1597 return -1;
1598 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001599 if (how_many < 0) {
1600 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1601 return -1;
1602 }
1603 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001604 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1605 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001606 "Cannot write %zi characters at %zi "
1607 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001608 how_many, to_start, PyUnicode_GET_LENGTH(to));
1609 return -1;
1610 }
1611
1612 if (how_many == 0)
1613 return 0;
1614
Victor Stinner488fa492011-12-12 00:01:39 +01001615 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001616 return -1;
1617
1618 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1619 if (err) {
1620 PyErr_Format(PyExc_SystemError,
1621 "Cannot copy %s characters "
1622 "into a string of %s characters",
1623 unicode_kind_name(from),
1624 unicode_kind_name(to));
1625 return -1;
1626 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001627 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001628}
1629
Victor Stinner17222162011-09-28 22:15:37 +02001630/* Find the maximum code point and count the number of surrogate pairs so a
1631 correct string length can be computed before converting a string to UCS4.
1632 This function counts single surrogates as a character and not as a pair.
1633
1634 Return 0 on success, or -1 on error. */
1635static int
1636find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1637 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638{
1639 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001640 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001641
Victor Stinnerc53be962011-10-02 21:33:54 +02001642 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001643 *num_surrogates = 0;
1644 *maxchar = 0;
1645
1646 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001648 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1649 && (iter+1) < end
1650 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1651 {
1652 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1653 ++(*num_surrogates);
1654 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655 }
1656 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001658 {
1659 ch = *iter;
1660 iter++;
1661 }
1662 if (ch > *maxchar) {
1663 *maxchar = ch;
1664 if (*maxchar > MAX_UNICODE) {
1665 PyErr_Format(PyExc_ValueError,
1666 "character U+%x is not in range [U+0000; U+10ffff]",
1667 ch);
1668 return -1;
1669 }
1670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671 }
1672 return 0;
1673}
1674
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001675int
1676_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677{
1678 wchar_t *end;
1679 Py_UCS4 maxchar = 0;
1680 Py_ssize_t num_surrogates;
1681#if SIZEOF_WCHAR_T == 2
1682 Py_ssize_t length_wo_surrogates;
1683#endif
1684
Georg Brandl7597add2011-10-05 16:36:47 +02001685 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001686 strings were created using _PyObject_New() and where no canonical
1687 representation (the str field) has been set yet aka strings
1688 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001689 assert(_PyUnicode_CHECK(unicode));
1690 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001693 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001694 /* Actually, it should neither be interned nor be anything else: */
1695 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001698 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001699 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701
1702 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001703 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1704 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705 PyErr_NoMemory();
1706 return -1;
1707 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001708 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 _PyUnicode_WSTR(unicode), end,
1710 PyUnicode_1BYTE_DATA(unicode));
1711 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1712 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1713 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1714 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001715 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001716 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001717 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 }
1719 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001720 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001721 _PyUnicode_UTF8(unicode) = NULL;
1722 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 }
1724 PyObject_FREE(_PyUnicode_WSTR(unicode));
1725 _PyUnicode_WSTR(unicode) = NULL;
1726 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1727 }
1728 /* In this case we might have to convert down from 4-byte native
1729 wchar_t to 2-byte unicode. */
1730 else if (maxchar < 65536) {
1731 assert(num_surrogates == 0 &&
1732 "FindMaxCharAndNumSurrogatePairs() messed up");
1733
Victor Stinner506f5922011-09-28 22:34:18 +02001734#if SIZEOF_WCHAR_T == 2
1735 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001736 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001737 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1738 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1739 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001740 _PyUnicode_UTF8(unicode) = NULL;
1741 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001742#else
1743 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001744 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001745 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001746 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001747 PyErr_NoMemory();
1748 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 }
Victor Stinner506f5922011-09-28 22:34:18 +02001750 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1751 _PyUnicode_WSTR(unicode), end,
1752 PyUnicode_2BYTE_DATA(unicode));
1753 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1754 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1755 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001756 _PyUnicode_UTF8(unicode) = NULL;
1757 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001758 PyObject_FREE(_PyUnicode_WSTR(unicode));
1759 _PyUnicode_WSTR(unicode) = NULL;
1760 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1761#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 }
1763 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1764 else {
1765#if SIZEOF_WCHAR_T == 2
1766 /* in case the native representation is 2-bytes, we need to allocate a
1767 new normalized 4-byte version. */
1768 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001769 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1770 PyErr_NoMemory();
1771 return -1;
1772 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001773 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1774 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 PyErr_NoMemory();
1776 return -1;
1777 }
1778 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1779 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001780 _PyUnicode_UTF8(unicode) = NULL;
1781 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001782 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1783 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001784 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 PyObject_FREE(_PyUnicode_WSTR(unicode));
1786 _PyUnicode_WSTR(unicode) = NULL;
1787 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1788#else
1789 assert(num_surrogates == 0);
1790
Victor Stinnerc3c74152011-10-02 20:39:55 +02001791 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001793 _PyUnicode_UTF8(unicode) = NULL;
1794 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1796#endif
1797 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1798 }
1799 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001800 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 return 0;
1802}
1803
Alexander Belopolsky40018472011-02-26 01:02:56 +00001804static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001805unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806{
Walter Dörwald16807132007-05-25 13:52:07 +00001807 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001808 case SSTATE_NOT_INTERNED:
1809 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001810
Benjamin Peterson29060642009-01-31 22:14:21 +00001811 case SSTATE_INTERNED_MORTAL:
1812 /* revive dead object temporarily for DelItem */
1813 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001814 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001815 Py_FatalError(
1816 "deletion of interned string failed");
1817 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001818
Benjamin Peterson29060642009-01-31 22:14:21 +00001819 case SSTATE_INTERNED_IMMORTAL:
1820 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001821 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001822
Benjamin Peterson29060642009-01-31 22:14:21 +00001823 default:
1824 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001825 }
1826
Victor Stinner03490912011-10-03 23:45:12 +02001827 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001829 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001830 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001831 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1832 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001834 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835}
1836
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    const PyASCIIObject *header = (const PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready string of length one can be a cached Latin-1 char. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    return first < 256 && unicode_latin1[first] == unicode;
}
#endif
1853
Alexander Belopolsky40018472011-02-26 01:02:56 +00001854static int
Victor Stinner488fa492011-12-12 00:01:39 +01001855unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001856{
Victor Stinner488fa492011-12-12 00:01:39 +01001857 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001858 if (Py_REFCNT(unicode) != 1)
1859 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001860 if (_PyUnicode_HASH(unicode) != -1)
1861 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 if (PyUnicode_CHECK_INTERNED(unicode))
1863 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001864 if (!PyUnicode_CheckExact(unicode))
1865 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001866#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001867 /* singleton refcount is greater than 1 */
1868 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001869#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001870 return 1;
1871}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001872
Victor Stinnerfe226c02011-10-03 03:52:20 +02001873static int
1874unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1875{
1876 PyObject *unicode;
1877 Py_ssize_t old_length;
1878
1879 assert(p_unicode != NULL);
1880 unicode = *p_unicode;
1881
1882 assert(unicode != NULL);
1883 assert(PyUnicode_Check(unicode));
1884 assert(0 <= length);
1885
Victor Stinner910337b2011-10-03 03:20:16 +02001886 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001887 old_length = PyUnicode_WSTR_LENGTH(unicode);
1888 else
1889 old_length = PyUnicode_GET_LENGTH(unicode);
1890 if (old_length == length)
1891 return 0;
1892
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001893 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001894 _Py_INCREF_UNICODE_EMPTY();
1895 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001896 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001897 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001898 return 0;
1899 }
1900
Victor Stinner488fa492011-12-12 00:01:39 +01001901 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001902 PyObject *copy = resize_copy(unicode, length);
1903 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001904 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001905 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001906 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001907 }
1908
Victor Stinnerfe226c02011-10-03 03:52:20 +02001909 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001910 PyObject *new_unicode = resize_compact(unicode, length);
1911 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001912 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001913 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001914 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001915 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001916 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001917}
1918
Alexander Belopolsky40018472011-02-26 01:02:56 +00001919int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001920PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001921{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001922 PyObject *unicode;
1923 if (p_unicode == NULL) {
1924 PyErr_BadInternalCall();
1925 return -1;
1926 }
1927 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001928 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001929 {
1930 PyErr_BadInternalCall();
1931 return -1;
1932 }
1933 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001934}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001935
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001936/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001937
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001938 WARNING: The function doesn't copy the terminating null character and
1939 doesn't check the maximum character (may write a latin1 character in an
1940 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001941static void
1942unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1943 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001944{
1945 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1946 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001947 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001948
1949 switch (kind) {
1950 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001951 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001952#ifdef Py_DEBUG
1953 if (PyUnicode_IS_ASCII(unicode)) {
1954 Py_UCS4 maxchar = ucs1lib_find_max_char(
1955 (const Py_UCS1*)str,
1956 (const Py_UCS1*)str + len);
1957 assert(maxchar < 128);
1958 }
1959#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001960 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001961 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001962 }
1963 case PyUnicode_2BYTE_KIND: {
1964 Py_UCS2 *start = (Py_UCS2 *)data + index;
1965 Py_UCS2 *ucs2 = start;
1966 assert(index <= PyUnicode_GET_LENGTH(unicode));
1967
Victor Stinner184252a2012-06-16 02:57:41 +02001968 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001969 *ucs2 = (Py_UCS2)*str;
1970
1971 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001972 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001973 }
1974 default: {
1975 Py_UCS4 *start = (Py_UCS4 *)data + index;
1976 Py_UCS4 *ucs4 = start;
1977 assert(kind == PyUnicode_4BYTE_KIND);
1978 assert(index <= PyUnicode_GET_LENGTH(unicode));
1979
Victor Stinner184252a2012-06-16 02:57:41 +02001980 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001981 *ucs4 = (Py_UCS4)*str;
1982
1983 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001984 }
1985 }
1986}
1987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988static PyObject*
1989get_latin1_char(unsigned char ch)
1990{
Victor Stinnera464fc12011-10-02 20:39:30 +02001991 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001993 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 if (!unicode)
1995 return NULL;
1996 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001997 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 unicode_latin1[ch] = unicode;
1999 }
2000 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002001 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002}
2003
Victor Stinner985a82a2014-01-03 12:53:47 +01002004static PyObject*
2005unicode_char(Py_UCS4 ch)
2006{
2007 PyObject *unicode;
2008
2009 assert(ch <= MAX_UNICODE);
2010
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002011 if (ch < 256)
2012 return get_latin1_char(ch);
2013
Victor Stinner985a82a2014-01-03 12:53:47 +01002014 unicode = PyUnicode_New(1, ch);
2015 if (unicode == NULL)
2016 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002017
2018 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2019 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002020 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002021 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002022 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2023 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2024 }
2025 assert(_PyUnicode_CheckConsistency(unicode, 1));
2026 return unicode;
2027}
2028
Alexander Belopolsky40018472011-02-26 01:02:56 +00002029PyObject *
2030PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002032 if (u == NULL)
2033 return (PyObject*)_PyUnicode_New(size);
2034
2035 if (size < 0) {
2036 PyErr_BadInternalCall();
2037 return NULL;
2038 }
2039
2040 return PyUnicode_FromWideChar(u, size);
2041}
2042
2043PyObject *
2044PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2045{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002046 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 Py_UCS4 maxchar = 0;
2048 Py_ssize_t num_surrogates;
2049
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002050 if (u == NULL && size != 0) {
2051 PyErr_BadInternalCall();
2052 return NULL;
2053 }
2054
2055 if (size == -1) {
2056 size = wcslen(u);
2057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002059 /* If the Unicode data is known at construction time, we can apply
2060 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002062 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002063 if (size == 0)
2064 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 /* Single character Unicode objects in the Latin-1 range are
2067 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002068 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002069 return get_latin1_char((unsigned char)*u);
2070
2071 /* If not empty and not single character, copy the Unicode data
2072 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002073 if (find_maxchar_surrogates(u, u + size,
2074 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002075 return NULL;
2076
Victor Stinner8faf8212011-12-08 22:14:11 +01002077 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 if (!unicode)
2079 return NULL;
2080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 switch (PyUnicode_KIND(unicode)) {
2082 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002083 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2085 break;
2086 case PyUnicode_2BYTE_KIND:
2087#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002088 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002090 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2092#endif
2093 break;
2094 case PyUnicode_4BYTE_KIND:
2095#if SIZEOF_WCHAR_T == 2
2096 /* This is the only case which has to process surrogates, thus
2097 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002098 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099#else
2100 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002101 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002102#endif
2103 break;
2104 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002105 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002108 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109}
2110
Alexander Belopolsky40018472011-02-26 01:02:56 +00002111PyObject *
2112PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002113{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002114 if (size < 0) {
2115 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002116 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002117 return NULL;
2118 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002119 if (u != NULL)
2120 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2121 else
2122 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002123}
2124
Alexander Belopolsky40018472011-02-26 01:02:56 +00002125PyObject *
2126PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002127{
2128 size_t size = strlen(u);
2129 if (size > PY_SSIZE_T_MAX) {
2130 PyErr_SetString(PyExc_OverflowError, "input too long");
2131 return NULL;
2132 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002133 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002134}
2135
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002136PyObject *
2137_PyUnicode_FromId(_Py_Identifier *id)
2138{
2139 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002140 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2141 strlen(id->string),
2142 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002143 if (!id->object)
2144 return NULL;
2145 PyUnicode_InternInPlace(&id->object);
2146 assert(!id->next);
2147 id->next = static_strings;
2148 static_strings = id;
2149 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002150 return id->object;
2151}
2152
2153void
2154_PyUnicode_ClearStaticStrings()
2155{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002156 _Py_Identifier *tmp, *s = static_strings;
2157 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002158 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002159 tmp = s->next;
2160 s->next = NULL;
2161 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002162 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002163 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002164}
2165
Benjamin Peterson0df54292012-03-26 14:50:32 -04002166/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002167
Victor Stinnerd3f08822012-05-29 12:57:52 +02002168PyObject*
2169_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002170{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002171 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002172 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002173 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002174#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002175 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002176#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002177 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002178 }
Victor Stinner785938e2011-12-11 20:09:03 +01002179 unicode = PyUnicode_New(size, 127);
2180 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002181 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002182 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2183 assert(_PyUnicode_CheckConsistency(unicode, 1));
2184 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002185}
2186
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002187static Py_UCS4
2188kind_maxchar_limit(unsigned int kind)
2189{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002190 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002191 case PyUnicode_1BYTE_KIND:
2192 return 0x80;
2193 case PyUnicode_2BYTE_KIND:
2194 return 0x100;
2195 case PyUnicode_4BYTE_KIND:
2196 return 0x10000;
2197 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002198 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002199 }
2200}
2201
Victor Stinner702c7342011-10-05 13:50:52 +02002202static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002203_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002204{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002206 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002207
Serhiy Storchaka678db842013-01-26 12:16:36 +02002208 if (size == 0)
2209 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002210 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002211 if (size == 1)
2212 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002213
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002214 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002215 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 if (!res)
2217 return NULL;
2218 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002219 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002221}
2222
Victor Stinnere57b1c02011-09-28 22:20:48 +02002223static PyObject*
2224_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225{
2226 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002227 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002228
Serhiy Storchaka678db842013-01-26 12:16:36 +02002229 if (size == 0)
2230 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002231 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002232 if (size == 1)
2233 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002234
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002235 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002236 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 if (!res)
2238 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002239 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002241 else {
2242 _PyUnicode_CONVERT_BYTES(
2243 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2244 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002245 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 return res;
2247}
2248
Victor Stinnere57b1c02011-09-28 22:20:48 +02002249static PyObject*
2250_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251{
2252 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002253 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002254
Serhiy Storchaka678db842013-01-26 12:16:36 +02002255 if (size == 0)
2256 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002257 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002258 if (size == 1)
2259 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002260
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002261 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002262 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 if (!res)
2264 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002265 if (max_char < 256)
2266 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2267 PyUnicode_1BYTE_DATA(res));
2268 else if (max_char < 0x10000)
2269 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2270 PyUnicode_2BYTE_DATA(res));
2271 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002273 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 return res;
2275}
2276
2277PyObject*
2278PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2279{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002280 if (size < 0) {
2281 PyErr_SetString(PyExc_ValueError, "size must be positive");
2282 return NULL;
2283 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002284 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002285 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002286 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002287 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002288 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002290 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002291 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002292 PyErr_SetString(PyExc_SystemError, "invalid kind");
2293 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002294 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295}
2296
Victor Stinnerece58de2012-04-23 23:36:38 +02002297Py_UCS4
2298_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2299{
2300 enum PyUnicode_Kind kind;
2301 void *startptr, *endptr;
2302
2303 assert(PyUnicode_IS_READY(unicode));
2304 assert(0 <= start);
2305 assert(end <= PyUnicode_GET_LENGTH(unicode));
2306 assert(start <= end);
2307
2308 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2309 return PyUnicode_MAX_CHAR_VALUE(unicode);
2310
2311 if (start == end)
2312 return 127;
2313
Victor Stinner94d558b2012-04-27 22:26:58 +02002314 if (PyUnicode_IS_ASCII(unicode))
2315 return 127;
2316
Victor Stinnerece58de2012-04-23 23:36:38 +02002317 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002318 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002319 endptr = (char *)startptr + end * kind;
2320 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002321 switch(kind) {
2322 case PyUnicode_1BYTE_KIND:
2323 return ucs1lib_find_max_char(startptr, endptr);
2324 case PyUnicode_2BYTE_KIND:
2325 return ucs2lib_find_max_char(startptr, endptr);
2326 case PyUnicode_4BYTE_KIND:
2327 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002328 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002329 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002330 }
2331}
2332
Victor Stinner25a4b292011-10-06 12:31:55 +02002333/* Ensure that a string uses the most efficient storage, if it is not the
2334 case: create a new string with of the right kind. Write NULL into *p_unicode
2335 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002336static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002337unicode_adjust_maxchar(PyObject **p_unicode)
2338{
2339 PyObject *unicode, *copy;
2340 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002341 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002342 unsigned int kind;
2343
2344 assert(p_unicode != NULL);
2345 unicode = *p_unicode;
2346 assert(PyUnicode_IS_READY(unicode));
2347 if (PyUnicode_IS_ASCII(unicode))
2348 return;
2349
2350 len = PyUnicode_GET_LENGTH(unicode);
2351 kind = PyUnicode_KIND(unicode);
2352 if (kind == PyUnicode_1BYTE_KIND) {
2353 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002354 max_char = ucs1lib_find_max_char(u, u + len);
2355 if (max_char >= 128)
2356 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002357 }
2358 else if (kind == PyUnicode_2BYTE_KIND) {
2359 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002360 max_char = ucs2lib_find_max_char(u, u + len);
2361 if (max_char >= 256)
2362 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002363 }
2364 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002365 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002366 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002367 max_char = ucs4lib_find_max_char(u, u + len);
2368 if (max_char >= 0x10000)
2369 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002370 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002371 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002372 if (copy != NULL)
2373 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002374 Py_DECREF(unicode);
2375 *p_unicode = copy;
2376}
2377
Victor Stinner034f6cf2011-09-30 02:26:44 +02002378PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002379_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002380{
Victor Stinner87af4f22011-11-21 23:03:47 +01002381 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002382 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383
Victor Stinner034f6cf2011-09-30 02:26:44 +02002384 if (!PyUnicode_Check(unicode)) {
2385 PyErr_BadInternalCall();
2386 return NULL;
2387 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002388 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002389 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002390
Victor Stinner87af4f22011-11-21 23:03:47 +01002391 length = PyUnicode_GET_LENGTH(unicode);
2392 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002393 if (!copy)
2394 return NULL;
2395 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2396
Christian Heimesf051e432016-09-13 20:22:02 +02002397 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002398 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002399 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002400 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002401}
2402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403
Victor Stinnerbc603d12011-10-02 01:00:40 +02002404/* Widen Unicode objects to larger buffers. Don't write terminating null
2405 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406
2407void*
2408_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2409{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002410 Py_ssize_t len;
2411 void *result;
2412 unsigned int skind;
2413
Benjamin Petersonbac79492012-01-14 13:34:47 -05002414 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002415 return NULL;
2416
2417 len = PyUnicode_GET_LENGTH(s);
2418 skind = PyUnicode_KIND(s);
2419 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002420 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 return NULL;
2422 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002423 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002424 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002425 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002426 if (!result)
2427 return PyErr_NoMemory();
2428 assert(skind == PyUnicode_1BYTE_KIND);
2429 _PyUnicode_CONVERT_BYTES(
2430 Py_UCS1, Py_UCS2,
2431 PyUnicode_1BYTE_DATA(s),
2432 PyUnicode_1BYTE_DATA(s) + len,
2433 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002435 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002436 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002437 if (!result)
2438 return PyErr_NoMemory();
2439 if (skind == PyUnicode_2BYTE_KIND) {
2440 _PyUnicode_CONVERT_BYTES(
2441 Py_UCS2, Py_UCS4,
2442 PyUnicode_2BYTE_DATA(s),
2443 PyUnicode_2BYTE_DATA(s) + len,
2444 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002446 else {
2447 assert(skind == PyUnicode_1BYTE_KIND);
2448 _PyUnicode_CONVERT_BYTES(
2449 Py_UCS1, Py_UCS4,
2450 PyUnicode_1BYTE_DATA(s),
2451 PyUnicode_1BYTE_DATA(s) + len,
2452 result);
2453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002455 default:
2456 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 }
Victor Stinner01698042011-10-04 00:04:26 +02002458 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 return NULL;
2460}
2461
2462static Py_UCS4*
2463as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2464 int copy_null)
2465{
2466 int kind;
2467 void *data;
2468 Py_ssize_t len, targetlen;
2469 if (PyUnicode_READY(string) == -1)
2470 return NULL;
2471 kind = PyUnicode_KIND(string);
2472 data = PyUnicode_DATA(string);
2473 len = PyUnicode_GET_LENGTH(string);
2474 targetlen = len;
2475 if (copy_null)
2476 targetlen++;
2477 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002478 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002479 if (!target) {
2480 PyErr_NoMemory();
2481 return NULL;
2482 }
2483 }
2484 else {
2485 if (targetsize < targetlen) {
2486 PyErr_Format(PyExc_SystemError,
2487 "string is longer than the buffer");
2488 if (copy_null && 0 < targetsize)
2489 target[0] = 0;
2490 return NULL;
2491 }
2492 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002493 if (kind == PyUnicode_1BYTE_KIND) {
2494 Py_UCS1 *start = (Py_UCS1 *) data;
2495 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002497 else if (kind == PyUnicode_2BYTE_KIND) {
2498 Py_UCS2 *start = (Py_UCS2 *) data;
2499 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2500 }
2501 else {
2502 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002503 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002504 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 if (copy_null)
2506 target[len] = 0;
2507 return target;
2508}
2509
2510Py_UCS4*
2511PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2512 int copy_null)
2513{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002514 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 PyErr_BadInternalCall();
2516 return NULL;
2517 }
2518 return as_ucs4(string, target, targetsize, copy_null);
2519}
2520
2521Py_UCS4*
2522PyUnicode_AsUCS4Copy(PyObject *string)
2523{
2524 return as_ucs4(string, NULL, 0, 1);
2525}
2526
Victor Stinner15a11362012-10-06 23:48:20 +02002527/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002528 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2529 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2530#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002531
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002532static int
2533unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2534 Py_ssize_t width, Py_ssize_t precision)
2535{
2536 Py_ssize_t length, fill, arglen;
2537 Py_UCS4 maxchar;
2538
2539 if (PyUnicode_READY(str) == -1)
2540 return -1;
2541
2542 length = PyUnicode_GET_LENGTH(str);
2543 if ((precision == -1 || precision >= length)
2544 && width <= length)
2545 return _PyUnicodeWriter_WriteStr(writer, str);
2546
2547 if (precision != -1)
2548 length = Py_MIN(precision, length);
2549
2550 arglen = Py_MAX(length, width);
2551 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2552 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2553 else
2554 maxchar = writer->maxchar;
2555
2556 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2557 return -1;
2558
2559 if (width > length) {
2560 fill = width - length;
2561 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2562 return -1;
2563 writer->pos += fill;
2564 }
2565
2566 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2567 str, 0, length);
2568 writer->pos += length;
2569 return 0;
2570}
2571
2572static int
Victor Stinner998b8062018-09-12 00:23:25 +02002573unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002574 Py_ssize_t width, Py_ssize_t precision)
2575{
2576 /* UTF-8 */
2577 Py_ssize_t length;
2578 PyObject *unicode;
2579 int res;
2580
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002581 if (precision == -1) {
2582 length = strlen(str);
2583 }
2584 else {
2585 length = 0;
2586 while (length < precision && str[length]) {
2587 length++;
2588 }
2589 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2591 if (unicode == NULL)
2592 return -1;
2593
2594 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2595 Py_DECREF(unicode);
2596 return res;
2597}
2598
Victor Stinner96865452011-03-01 23:44:09 +00002599static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002600unicode_fromformat_arg(_PyUnicodeWriter *writer,
2601 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002602{
Victor Stinnere215d962012-10-06 23:03:36 +02002603 const char *p;
2604 Py_ssize_t len;
2605 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002606 Py_ssize_t width;
2607 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002608 int longflag;
2609 int longlongflag;
2610 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002611 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002612
2613 p = f;
2614 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002615 zeropad = 0;
2616 if (*f == '0') {
2617 zeropad = 1;
2618 f++;
2619 }
Victor Stinner96865452011-03-01 23:44:09 +00002620
2621 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002622 width = -1;
2623 if (Py_ISDIGIT((unsigned)*f)) {
2624 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002625 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002626 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002627 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002628 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002629 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002630 return NULL;
2631 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002632 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002633 f++;
2634 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002635 }
2636 precision = -1;
2637 if (*f == '.') {
2638 f++;
2639 if (Py_ISDIGIT((unsigned)*f)) {
2640 precision = (*f - '0');
2641 f++;
2642 while (Py_ISDIGIT((unsigned)*f)) {
2643 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2644 PyErr_SetString(PyExc_ValueError,
2645 "precision too big");
2646 return NULL;
2647 }
2648 precision = (precision * 10) + (*f - '0');
2649 f++;
2650 }
2651 }
Victor Stinner96865452011-03-01 23:44:09 +00002652 if (*f == '%') {
2653 /* "%.3%s" => f points to "3" */
2654 f--;
2655 }
2656 }
2657 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002658 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002659 f--;
2660 }
Victor Stinner96865452011-03-01 23:44:09 +00002661
2662 /* Handle %ld, %lu, %lld and %llu. */
2663 longflag = 0;
2664 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002665 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002666 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002667 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002668 longflag = 1;
2669 ++f;
2670 }
Victor Stinner96865452011-03-01 23:44:09 +00002671 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002672 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002673 longlongflag = 1;
2674 f += 2;
2675 }
Victor Stinner96865452011-03-01 23:44:09 +00002676 }
2677 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002678 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002679 size_tflag = 1;
2680 ++f;
2681 }
Victor Stinnere215d962012-10-06 23:03:36 +02002682
2683 if (f[1] == '\0')
2684 writer->overallocate = 0;
2685
2686 switch (*f) {
2687 case 'c':
2688 {
2689 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002690 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002691 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002692 "character argument not in range(0x110000)");
2693 return NULL;
2694 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002695 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002696 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002697 break;
2698 }
2699
2700 case 'i':
2701 case 'd':
2702 case 'u':
2703 case 'x':
2704 {
2705 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002706 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002707 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002708
2709 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002710 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002711 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002712 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002713 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002714 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002715 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002716 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002717 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002718 va_arg(*vargs, size_t));
2719 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002720 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002721 va_arg(*vargs, unsigned int));
2722 }
2723 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002724 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002725 }
2726 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002727 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002728 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002729 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002730 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002731 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002732 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002733 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002734 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002735 va_arg(*vargs, Py_ssize_t));
2736 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002737 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002738 va_arg(*vargs, int));
2739 }
2740 assert(len >= 0);
2741
Victor Stinnere215d962012-10-06 23:03:36 +02002742 if (precision < len)
2743 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002744
2745 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002746 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2747 return NULL;
2748
Victor Stinnere215d962012-10-06 23:03:36 +02002749 if (width > precision) {
2750 Py_UCS4 fillchar;
2751 fill = width - precision;
2752 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002753 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2754 return NULL;
2755 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002756 }
Victor Stinner15a11362012-10-06 23:48:20 +02002757 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002758 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002759 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2760 return NULL;
2761 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002762 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002763
Victor Stinner4a587072013-11-19 12:54:53 +01002764 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2765 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002766 break;
2767 }
2768
2769 case 'p':
2770 {
2771 char number[MAX_LONG_LONG_CHARS];
2772
2773 len = sprintf(number, "%p", va_arg(*vargs, void*));
2774 assert(len >= 0);
2775
2776 /* %p is ill-defined: ensure leading 0x. */
2777 if (number[1] == 'X')
2778 number[1] = 'x';
2779 else if (number[1] != 'x') {
2780 memmove(number + 2, number,
2781 strlen(number) + 1);
2782 number[0] = '0';
2783 number[1] = 'x';
2784 len += 2;
2785 }
2786
Victor Stinner4a587072013-11-19 12:54:53 +01002787 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002788 return NULL;
2789 break;
2790 }
2791
2792 case 's':
2793 {
2794 /* UTF-8 */
2795 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002796 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002797 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002798 break;
2799 }
2800
2801 case 'U':
2802 {
2803 PyObject *obj = va_arg(*vargs, PyObject *);
2804 assert(obj && _PyUnicode_CHECK(obj));
2805
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002806 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002807 return NULL;
2808 break;
2809 }
2810
2811 case 'V':
2812 {
2813 PyObject *obj = va_arg(*vargs, PyObject *);
2814 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002815 if (obj) {
2816 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002817 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002818 return NULL;
2819 }
2820 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002821 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002822 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002823 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002824 }
2825 break;
2826 }
2827
2828 case 'S':
2829 {
2830 PyObject *obj = va_arg(*vargs, PyObject *);
2831 PyObject *str;
2832 assert(obj);
2833 str = PyObject_Str(obj);
2834 if (!str)
2835 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002836 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002837 Py_DECREF(str);
2838 return NULL;
2839 }
2840 Py_DECREF(str);
2841 break;
2842 }
2843
2844 case 'R':
2845 {
2846 PyObject *obj = va_arg(*vargs, PyObject *);
2847 PyObject *repr;
2848 assert(obj);
2849 repr = PyObject_Repr(obj);
2850 if (!repr)
2851 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002852 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002853 Py_DECREF(repr);
2854 return NULL;
2855 }
2856 Py_DECREF(repr);
2857 break;
2858 }
2859
2860 case 'A':
2861 {
2862 PyObject *obj = va_arg(*vargs, PyObject *);
2863 PyObject *ascii;
2864 assert(obj);
2865 ascii = PyObject_ASCII(obj);
2866 if (!ascii)
2867 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002868 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002869 Py_DECREF(ascii);
2870 return NULL;
2871 }
2872 Py_DECREF(ascii);
2873 break;
2874 }
2875
2876 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002877 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002878 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002879 break;
2880
2881 default:
2882 /* if we stumble upon an unknown formatting code, copy the rest
2883 of the format string to the output string. (we cannot just
2884 skip the code, since there's no way to know what's in the
2885 argument list) */
2886 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002887 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002888 return NULL;
2889 f = p+len;
2890 return f;
2891 }
2892
2893 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002894 return f;
2895}
2896
Walter Dörwaldd2034312007-05-18 16:29:38 +00002897PyObject *
2898PyUnicode_FromFormatV(const char *format, va_list vargs)
2899{
Victor Stinnere215d962012-10-06 23:03:36 +02002900 va_list vargs2;
2901 const char *f;
2902 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002903
Victor Stinner8f674cc2013-04-17 23:02:17 +02002904 _PyUnicodeWriter_Init(&writer);
2905 writer.min_length = strlen(format) + 100;
2906 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002907
Benjamin Peterson0c212142016-09-20 20:39:33 -07002908 // Copy varags to be able to pass a reference to a subfunction.
2909 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002910
2911 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002912 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002913 f = unicode_fromformat_arg(&writer, f, &vargs2);
2914 if (f == NULL)
2915 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002916 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002917 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002918 const char *p;
2919 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002920
Victor Stinnere215d962012-10-06 23:03:36 +02002921 p = f;
2922 do
2923 {
2924 if ((unsigned char)*p > 127) {
2925 PyErr_Format(PyExc_ValueError,
2926 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2927 "string, got a non-ASCII byte: 0x%02x",
2928 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002929 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002930 }
2931 p++;
2932 }
2933 while (*p != '\0' && *p != '%');
2934 len = p - f;
2935
2936 if (*p == '\0')
2937 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002938
2939 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002940 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002941
2942 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002943 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002944 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002945 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002946 return _PyUnicodeWriter_Finish(&writer);
2947
2948 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002949 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002950 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002951 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002952}
2953
Walter Dörwaldd2034312007-05-18 16:29:38 +00002954PyObject *
2955PyUnicode_FromFormat(const char *format, ...)
2956{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002957 PyObject* ret;
2958 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002959
2960#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002961 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002962#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002963 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002964#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002965 ret = PyUnicode_FromFormatV(format, vargs);
2966 va_end(vargs);
2967 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002968}
2969
Serhiy Storchakac46db922018-10-23 22:58:24 +03002970static Py_ssize_t
2971unicode_get_widechar_size(PyObject *unicode)
2972{
2973 Py_ssize_t res;
2974
2975 assert(unicode != NULL);
2976 assert(_PyUnicode_CHECK(unicode));
2977
2978 if (_PyUnicode_WSTR(unicode) != NULL) {
2979 return PyUnicode_WSTR_LENGTH(unicode);
2980 }
2981 assert(PyUnicode_IS_READY(unicode));
2982
2983 res = _PyUnicode_LENGTH(unicode);
2984#if SIZEOF_WCHAR_T == 2
2985 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
2986 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2987 const Py_UCS4 *end = s + res;
2988 for (; s < end; ++s) {
2989 if (*s > 0xFFFF) {
2990 ++res;
2991 }
2992 }
2993 }
2994#endif
2995 return res;
2996}
2997
2998static void
2999unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3000{
3001 const wchar_t *wstr;
3002
3003 assert(unicode != NULL);
3004 assert(_PyUnicode_CHECK(unicode));
3005
3006 wstr = _PyUnicode_WSTR(unicode);
3007 if (wstr != NULL) {
3008 memcpy(w, wstr, size * sizeof(wchar_t));
3009 return;
3010 }
3011 assert(PyUnicode_IS_READY(unicode));
3012
3013 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3014 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3015 for (; size--; ++s, ++w) {
3016 *w = *s;
3017 }
3018 }
3019 else {
3020#if SIZEOF_WCHAR_T == 4
3021 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3022 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3023 for (; size--; ++s, ++w) {
3024 *w = *s;
3025 }
3026#else
3027 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3028 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3029 for (; size--; ++s, ++w) {
3030 Py_UCS4 ch = *s;
3031 if (ch > 0xFFFF) {
3032 assert(ch <= MAX_UNICODE);
3033 /* encode surrogate pair in this case */
3034 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3035 if (!size--)
3036 break;
3037 *w = Py_UNICODE_LOW_SURROGATE(ch);
3038 }
3039 else {
3040 *w = ch;
3041 }
3042 }
3043#endif
3044 }
3045}
3046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003047#ifdef HAVE_WCHAR_H
3048
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003049/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003050
Victor Stinnerd88d9832011-09-06 02:00:05 +02003051 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003052 character) required to convert the unicode object. Ignore size argument.
3053
Victor Stinnerd88d9832011-09-06 02:00:05 +02003054 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003055 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003056 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003057Py_ssize_t
3058PyUnicode_AsWideChar(PyObject *unicode,
3059 wchar_t *w,
3060 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003061{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003062 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003063
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003064 if (unicode == NULL) {
3065 PyErr_BadInternalCall();
3066 return -1;
3067 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003068 if (!PyUnicode_Check(unicode)) {
3069 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003070 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003071 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003072
3073 res = unicode_get_widechar_size(unicode);
3074 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003075 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003076 }
3077
3078 if (size > res) {
3079 size = res + 1;
3080 }
3081 else {
3082 res = size;
3083 }
3084 unicode_copy_as_widechar(unicode, w, size);
3085 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003086}
3087
Victor Stinner137c34c2010-09-29 10:25:54 +00003088wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003089PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003090 Py_ssize_t *size)
3091{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003092 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003093 Py_ssize_t buflen;
3094
3095 if (unicode == NULL) {
3096 PyErr_BadInternalCall();
3097 return NULL;
3098 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003099 if (!PyUnicode_Check(unicode)) {
3100 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003101 return NULL;
3102 }
3103
Serhiy Storchakac46db922018-10-23 22:58:24 +03003104 buflen = unicode_get_widechar_size(unicode);
3105 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003106 if (buffer == NULL) {
3107 PyErr_NoMemory();
3108 return NULL;
3109 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003110 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3111 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003112 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003113 }
3114 else if (wcslen(buffer) != (size_t)buflen) {
3115 PyMem_FREE(buffer);
3116 PyErr_SetString(PyExc_ValueError,
3117 "embedded null character");
3118 return NULL;
3119 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003120 return buffer;
3121}
3122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003123#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124
Alexander Belopolsky40018472011-02-26 01:02:56 +00003125PyObject *
3126PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003127{
Victor Stinner8faf8212011-12-08 22:14:11 +01003128 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003129 PyErr_SetString(PyExc_ValueError,
3130 "chr() arg not in range(0x110000)");
3131 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003132 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003133
Victor Stinner985a82a2014-01-03 12:53:47 +01003134 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003135}
3136
Alexander Belopolsky40018472011-02-26 01:02:56 +00003137PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003138PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003140 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003141 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003142 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003143 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003144 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 Py_INCREF(obj);
3146 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003147 }
3148 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003149 /* For a Unicode subtype that's not a Unicode object,
3150 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003151 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003152 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003153 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003154 "Can't convert '%.100s' object to str implicitly",
3155 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003156 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003157}
3158
Alexander Belopolsky40018472011-02-26 01:02:56 +00003159PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003160PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003161 const char *encoding,
3162 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003163{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003164 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003165 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003166
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003168 PyErr_BadInternalCall();
3169 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003171
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003172 /* Decoding bytes objects is the most common case and should be fast */
3173 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003174 if (PyBytes_GET_SIZE(obj) == 0)
3175 _Py_RETURN_UNICODE_EMPTY();
3176 v = PyUnicode_Decode(
3177 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3178 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003179 return v;
3180 }
3181
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003182 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003183 PyErr_SetString(PyExc_TypeError,
3184 "decoding str is not supported");
3185 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003186 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003187
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003188 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3189 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3190 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003191 "decoding to str: need a bytes-like object, %.80s found",
3192 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003193 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003194 }
Tim Petersced69f82003-09-16 20:30:58 +00003195
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003196 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003197 PyBuffer_Release(&buffer);
3198 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003200
Serhiy Storchaka05997252013-01-26 12:14:02 +02003201 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003202 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003203 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204}
3205
/* Normalize an encoding name, similarly to encodings.normalize_encoding(),
   and additionally convert it to lowercase.

   Alphanumeric characters and '.' are kept (lowercased); every run of other
   characters that sits between two kept characters becomes a single '_',
   while such runs at the very start or end of the name are dropped.

   The result is written to `lower`, a buffer of `lower_len` bytes, and is
   null-terminated.  Return 1 on success, or 0 if the normalized name does
   not fit in lower_len-1 characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    char *const dst_limit = lower + (lower_len - 1);
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != '\0'; src++) {
        char ch = *src;

        if (!Py_ISALNUM(ch) && ch != '.') {
            /* Remember the separator; it is only emitted once a kept
               character follows it. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == dst_limit) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_limit) {
            return 0;
        }
        *dst++ = Py_TOLOWER(ch);
    }

    *dst = '\0';
    return 1;
}
3254
Alexander Belopolsky40018472011-02-26 01:02:56 +00003255PyObject *
3256PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003257 Py_ssize_t size,
3258 const char *encoding,
3259 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003260{
3261 PyObject *buffer = NULL, *unicode;
3262 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003263 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3264
3265 if (encoding == NULL) {
3266 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3267 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003268
Fred Drakee4315f52000-05-09 19:53:39 +00003269 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003270 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3271 char *lower = buflower;
3272
3273 /* Fast paths */
3274 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3275 lower += 3;
3276 if (*lower == '_') {
3277 /* Match "utf8" and "utf_8" */
3278 lower++;
3279 }
3280
3281 if (lower[0] == '8' && lower[1] == 0) {
3282 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3283 }
3284 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3285 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3286 }
3287 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3288 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3289 }
3290 }
3291 else {
3292 if (strcmp(lower, "ascii") == 0
3293 || strcmp(lower, "us_ascii") == 0) {
3294 return PyUnicode_DecodeASCII(s, size, errors);
3295 }
Steve Dowercc16be82016-09-08 10:35:16 -07003296 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003297 else if (strcmp(lower, "mbcs") == 0) {
3298 return PyUnicode_DecodeMBCS(s, size, errors);
3299 }
3300 #endif
3301 else if (strcmp(lower, "latin1") == 0
3302 || strcmp(lower, "latin_1") == 0
3303 || strcmp(lower, "iso_8859_1") == 0
3304 || strcmp(lower, "iso8859_1") == 0) {
3305 return PyUnicode_DecodeLatin1(s, size, errors);
3306 }
3307 }
Victor Stinner37296e82010-06-10 13:36:23 +00003308 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309
3310 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003311 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003312 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003313 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003314 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 if (buffer == NULL)
3316 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003317 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 if (unicode == NULL)
3319 goto onError;
3320 if (!PyUnicode_Check(unicode)) {
3321 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003322 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003323 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003324 encoding,
3325 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326 Py_DECREF(unicode);
3327 goto onError;
3328 }
3329 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003330 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003331
Benjamin Peterson29060642009-01-31 22:14:21 +00003332 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333 Py_XDECREF(buffer);
3334 return NULL;
3335}
3336
Alexander Belopolsky40018472011-02-26 01:02:56 +00003337PyObject *
3338PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003339 const char *encoding,
3340 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003341{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003342 if (!PyUnicode_Check(unicode)) {
3343 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003344 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003345 }
3346
Serhiy Storchaka00939072016-10-27 21:05:49 +03003347 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3348 "PyUnicode_AsDecodedObject() is deprecated; "
3349 "use PyCodec_Decode() to decode from str", 1) < 0)
3350 return NULL;
3351
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003352 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003353 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003354
3355 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003356 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003357}
3358
Alexander Belopolsky40018472011-02-26 01:02:56 +00003359PyObject *
3360PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003361 const char *encoding,
3362 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003363{
3364 PyObject *v;
3365
3366 if (!PyUnicode_Check(unicode)) {
3367 PyErr_BadArgument();
3368 goto onError;
3369 }
3370
Serhiy Storchaka00939072016-10-27 21:05:49 +03003371 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3372 "PyUnicode_AsDecodedUnicode() is deprecated; "
3373 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3374 return NULL;
3375
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003376 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003377 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003378
3379 /* Decode via the codec registry */
3380 v = PyCodec_Decode(unicode, encoding, errors);
3381 if (v == NULL)
3382 goto onError;
3383 if (!PyUnicode_Check(v)) {
3384 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003385 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003386 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003387 encoding,
3388 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003389 Py_DECREF(v);
3390 goto onError;
3391 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003392 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003393
Benjamin Peterson29060642009-01-31 22:14:21 +00003394 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003395 return NULL;
3396}
3397
Alexander Belopolsky40018472011-02-26 01:02:56 +00003398PyObject *
3399PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003400 Py_ssize_t size,
3401 const char *encoding,
3402 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403{
3404 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003405
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003406 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003408 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003409 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3410 Py_DECREF(unicode);
3411 return v;
3412}
3413
Alexander Belopolsky40018472011-02-26 01:02:56 +00003414PyObject *
3415PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003416 const char *encoding,
3417 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003418{
3419 PyObject *v;
3420
3421 if (!PyUnicode_Check(unicode)) {
3422 PyErr_BadArgument();
3423 goto onError;
3424 }
3425
Serhiy Storchaka00939072016-10-27 21:05:49 +03003426 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3427 "PyUnicode_AsEncodedObject() is deprecated; "
3428 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3429 "or PyCodec_Encode() for generic encoding", 1) < 0)
3430 return NULL;
3431
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003432 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003433 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003434
3435 /* Encode via the codec registry */
3436 v = PyCodec_Encode(unicode, encoding, errors);
3437 if (v == NULL)
3438 goto onError;
3439 return v;
3440
Benjamin Peterson29060642009-01-31 22:14:21 +00003441 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003442 return NULL;
3443}
3444
Victor Stinner1b579672011-12-17 05:47:23 +01003445
Victor Stinner2cba6b82018-01-10 22:46:15 +01003446static PyObject *
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003447unicode_encode_locale(PyObject *unicode, const char *errors,
3448 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003449{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003450 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003451
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003452 Py_ssize_t wlen;
3453 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3454 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003455 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003456 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003457
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003458 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003459 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003460 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003461 return NULL;
3462 }
3463
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003464 char *str;
3465 size_t error_pos;
3466 const char *reason;
3467 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003468 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003469 PyMem_Free(wstr);
3470
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003471 if (res != 0) {
3472 if (res == -2) {
3473 PyObject *exc;
3474 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3475 "locale", unicode,
3476 (Py_ssize_t)error_pos,
3477 (Py_ssize_t)(error_pos+1),
3478 reason);
3479 if (exc != NULL) {
3480 PyCodec_StrictErrors(exc);
3481 Py_DECREF(exc);
3482 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003483 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003484 else if (res == -3) {
3485 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3486 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003487 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003488 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003489 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003490 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003491 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003492
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003493 PyObject *bytes = PyBytes_FromString(str);
3494 PyMem_RawFree(str);
3495 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003496}
3497
Victor Stinnerad158722010-10-27 00:25:46 +00003498PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003499PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3500{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003501 return unicode_encode_locale(unicode, errors, 1);
3502}
3503
3504PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003505PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003506{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003507 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003508 const _PyCoreConfig *config = &interp->core_config;
3509#if defined(__APPLE__)
3510 return _PyUnicode_AsUTF8String(unicode, config->filesystem_errors);
3511#else
Victor Stinner793b5312011-04-27 00:24:21 +02003512 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3513 cannot use it to encode and decode filenames before it is loaded. Load
3514 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003515 implementation of the locale codec until the codec registry is
3516 initialized and the Python codec is loaded. See initfsencoding(). */
3517 if (interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003518 return PyUnicode_AsEncodedString(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003519 config->filesystem_encoding,
3520 config->filesystem_errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003521 }
3522 else {
Victor Stinner2cba6b82018-01-10 22:46:15 +01003523 return unicode_encode_locale(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003524 config->filesystem_errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003525 }
Victor Stinnerad158722010-10-27 00:25:46 +00003526#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003527}
3528
Alexander Belopolsky40018472011-02-26 01:02:56 +00003529PyObject *
3530PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003531 const char *encoding,
3532 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533{
3534 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003535 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003536
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 if (!PyUnicode_Check(unicode)) {
3538 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 }
Fred Drakee4315f52000-05-09 19:53:39 +00003541
Victor Stinner942889a2016-09-05 15:40:10 -07003542 if (encoding == NULL) {
3543 return _PyUnicode_AsUTF8String(unicode, errors);
3544 }
3545
Fred Drakee4315f52000-05-09 19:53:39 +00003546 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003547 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3548 char *lower = buflower;
3549
3550 /* Fast paths */
3551 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3552 lower += 3;
3553 if (*lower == '_') {
3554 /* Match "utf8" and "utf_8" */
3555 lower++;
3556 }
3557
3558 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003559 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003560 }
3561 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3562 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3563 }
3564 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3565 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3566 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003567 }
Victor Stinner942889a2016-09-05 15:40:10 -07003568 else {
3569 if (strcmp(lower, "ascii") == 0
3570 || strcmp(lower, "us_ascii") == 0) {
3571 return _PyUnicode_AsASCIIString(unicode, errors);
3572 }
Steve Dowercc16be82016-09-08 10:35:16 -07003573#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003574 else if (strcmp(lower, "mbcs") == 0) {
3575 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3576 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003577#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003578 else if (strcmp(lower, "latin1") == 0 ||
3579 strcmp(lower, "latin_1") == 0 ||
3580 strcmp(lower, "iso_8859_1") == 0 ||
3581 strcmp(lower, "iso8859_1") == 0) {
3582 return _PyUnicode_AsLatin1String(unicode, errors);
3583 }
3584 }
Victor Stinner37296e82010-06-10 13:36:23 +00003585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586
3587 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003588 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003590 return NULL;
3591
3592 /* The normal path */
3593 if (PyBytes_Check(v))
3594 return v;
3595
3596 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003597 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003598 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003599 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003600
3601 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003602 "encoder %s returned bytearray instead of bytes; "
3603 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003604 encoding);
3605 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003606 Py_DECREF(v);
3607 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003608 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003609
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003610 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3611 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003612 Py_DECREF(v);
3613 return b;
3614 }
3615
3616 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003617 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003618 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003619 encoding,
3620 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003621 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003622 return NULL;
3623}
3624
Alexander Belopolsky40018472011-02-26 01:02:56 +00003625PyObject *
3626PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003627 const char *encoding,
3628 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003629{
3630 PyObject *v;
3631
3632 if (!PyUnicode_Check(unicode)) {
3633 PyErr_BadArgument();
3634 goto onError;
3635 }
3636
Serhiy Storchaka00939072016-10-27 21:05:49 +03003637 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3638 "PyUnicode_AsEncodedUnicode() is deprecated; "
3639 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3640 return NULL;
3641
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003642 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003643 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003644
3645 /* Encode via the codec registry */
3646 v = PyCodec_Encode(unicode, encoding, errors);
3647 if (v == NULL)
3648 goto onError;
3649 if (!PyUnicode_Check(v)) {
3650 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003651 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003652 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003653 encoding,
3654 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003655 Py_DECREF(v);
3656 goto onError;
3657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003659
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 return NULL;
3662}
3663
Victor Stinner2cba6b82018-01-10 22:46:15 +01003664static PyObject*
3665unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3666 int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003667{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003668 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003669
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003670 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3671 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003672 return NULL;
3673 }
3674
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003675 wchar_t *wstr;
3676 size_t wlen;
3677 const char *reason;
3678 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003679 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003680 if (res != 0) {
3681 if (res == -2) {
3682 PyObject *exc;
3683 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3684 "locale", str, len,
3685 (Py_ssize_t)wlen,
3686 (Py_ssize_t)(wlen + 1),
3687 reason);
3688 if (exc != NULL) {
3689 PyCodec_StrictErrors(exc);
3690 Py_DECREF(exc);
3691 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003692 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003693 else if (res == -3) {
3694 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3695 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003696 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003697 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003698 }
Victor Stinner2f197072011-12-17 07:08:30 +01003699 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003700 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003701
3702 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3703 PyMem_RawFree(wstr);
3704 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003705}
3706
3707PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003708PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3709 const char *errors)
3710{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003711 return unicode_decode_locale(str, len, errors, 1);
3712}
3713
3714PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003715PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003716{
3717 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003718 return unicode_decode_locale(str, size, errors, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003719}
3720
3721
3722PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003723PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003724 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003725 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3726}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003727
Christian Heimes5894ba72007-11-04 11:43:14 +00003728PyObject*
3729PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3730{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003731 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003732 const _PyCoreConfig *config = &interp->core_config;
3733#if defined(__APPLE__)
3734 return PyUnicode_DecodeUTF8Stateful(s, size, config->filesystem_errors, NULL);
3735#else
Victor Stinner793b5312011-04-27 00:24:21 +02003736 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3737 cannot use it to encode and decode filenames before it is loaded. Load
3738 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003739 implementation of the locale codec until the codec registry is
3740 initialized and the Python codec is loaded. See initfsencoding(). */
3741 if (interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003742 return PyUnicode_Decode(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003743 config->filesystem_encoding,
3744 config->filesystem_errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003745 }
3746 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003747 return unicode_decode_locale(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003748 config->filesystem_errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003749 }
Victor Stinnerad158722010-10-27 00:25:46 +00003750#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003751}
3752
Martin v. Löwis011e8422009-05-05 04:43:17 +00003753
3754int
3755PyUnicode_FSConverter(PyObject* arg, void* addr)
3756{
Brett Cannonec6ce872016-09-06 15:50:29 -07003757 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003758 PyObject *output = NULL;
3759 Py_ssize_t size;
3760 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003761 if (arg == NULL) {
3762 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003763 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003764 return 1;
3765 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003766 path = PyOS_FSPath(arg);
3767 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003768 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003769 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003770 if (PyBytes_Check(path)) {
3771 output = path;
3772 }
3773 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3774 output = PyUnicode_EncodeFSDefault(path);
3775 Py_DECREF(path);
3776 if (!output) {
3777 return 0;
3778 }
3779 assert(PyBytes_Check(output));
3780 }
3781
Victor Stinner0ea2a462010-04-30 00:22:08 +00003782 size = PyBytes_GET_SIZE(output);
3783 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003784 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003785 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003786 Py_DECREF(output);
3787 return 0;
3788 }
3789 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003790 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003791}
3792
3793
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003794int
3795PyUnicode_FSDecoder(PyObject* arg, void* addr)
3796{
Brett Cannona5711202016-09-06 19:36:01 -07003797 int is_buffer = 0;
3798 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003799 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003800 if (arg == NULL) {
3801 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003802 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003803 return 1;
3804 }
Brett Cannona5711202016-09-06 19:36:01 -07003805
3806 is_buffer = PyObject_CheckBuffer(arg);
3807 if (!is_buffer) {
3808 path = PyOS_FSPath(arg);
3809 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003810 return 0;
3811 }
Brett Cannona5711202016-09-06 19:36:01 -07003812 }
3813 else {
3814 path = arg;
3815 Py_INCREF(arg);
3816 }
3817
3818 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003819 output = path;
3820 }
3821 else if (PyBytes_Check(path) || is_buffer) {
3822 PyObject *path_bytes = NULL;
3823
3824 if (!PyBytes_Check(path) &&
3825 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003826 "path should be string, bytes, or os.PathLike, not %.200s",
3827 Py_TYPE(arg)->tp_name)) {
3828 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003829 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003830 }
3831 path_bytes = PyBytes_FromObject(path);
3832 Py_DECREF(path);
3833 if (!path_bytes) {
3834 return 0;
3835 }
3836 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3837 PyBytes_GET_SIZE(path_bytes));
3838 Py_DECREF(path_bytes);
3839 if (!output) {
3840 return 0;
3841 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003842 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003843 else {
3844 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003845 "path should be string, bytes, or os.PathLike, not %.200s",
3846 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003847 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003848 return 0;
3849 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003850 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003851 Py_DECREF(output);
3852 return 0;
3853 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003854 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003855 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003856 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003857 Py_DECREF(output);
3858 return 0;
3859 }
3860 *(PyObject**)addr = output;
3861 return Py_CLEANUP_SUPPORTED;
3862}
3863
3864
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003865const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003867{
Christian Heimesf3863112007-11-22 07:46:41 +00003868 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003870 if (!PyUnicode_Check(unicode)) {
3871 PyErr_BadArgument();
3872 return NULL;
3873 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003874 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003875 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003876
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003877 if (PyUnicode_UTF8(unicode) == NULL) {
3878 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003879 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003880 if (bytes == NULL)
3881 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003882 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3883 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003884 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885 Py_DECREF(bytes);
3886 return NULL;
3887 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003888 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003889 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003890 PyBytes_AS_STRING(bytes),
3891 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003892 Py_DECREF(bytes);
3893 }
3894
3895 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003896 *psize = PyUnicode_UTF8_LENGTH(unicode);
3897 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003898}
3899
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003900const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003901PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003902{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003903 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3904}
3905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003906Py_UNICODE *
3907PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3908{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003909 if (!PyUnicode_Check(unicode)) {
3910 PyErr_BadArgument();
3911 return NULL;
3912 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003913 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3914 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003916 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003917 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003918
Serhiy Storchakac46db922018-10-23 22:58:24 +03003919 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3920 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3921 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003924 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3925 if (w == NULL) {
3926 PyErr_NoMemory();
3927 return NULL;
3928 }
3929 unicode_copy_as_widechar(unicode, w, wlen + 1);
3930 _PyUnicode_WSTR(unicode) = w;
3931 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3932 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 }
3934 }
3935 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003936 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003937 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00003938}
3939
Alexander Belopolsky40018472011-02-26 01:02:56 +00003940Py_UNICODE *
3941PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944}
3945
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03003946const Py_UNICODE *
3947_PyUnicode_AsUnicode(PyObject *unicode)
3948{
3949 Py_ssize_t size;
3950 const Py_UNICODE *wstr;
3951
3952 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3953 if (wstr && wcslen(wstr) != (size_t)size) {
3954 PyErr_SetString(PyExc_ValueError, "embedded null character");
3955 return NULL;
3956 }
3957 return wstr;
3958}
3959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960
Alexander Belopolsky40018472011-02-26 01:02:56 +00003961Py_ssize_t
3962PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963{
3964 if (!PyUnicode_Check(unicode)) {
3965 PyErr_BadArgument();
3966 goto onError;
3967 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003968 if (_PyUnicode_WSTR(unicode) == NULL) {
3969 if (PyUnicode_AsUnicode(unicode) == NULL)
3970 goto onError;
3971 }
3972 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973
Benjamin Peterson29060642009-01-31 22:14:21 +00003974 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 return -1;
3976}
3977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978Py_ssize_t
3979PyUnicode_GetLength(PyObject *unicode)
3980{
Victor Stinner07621332012-06-16 04:53:46 +02003981 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003982 PyErr_BadArgument();
3983 return -1;
3984 }
Victor Stinner07621332012-06-16 04:53:46 +02003985 if (PyUnicode_READY(unicode) == -1)
3986 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 return PyUnicode_GET_LENGTH(unicode);
3988}
3989
3990Py_UCS4
3991PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3992{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003993 void *data;
3994 int kind;
3995
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003996 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003997 PyErr_BadArgument();
3998 return (Py_UCS4)-1;
3999 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004000 if (PyUnicode_READY(unicode) == -1) {
4001 return (Py_UCS4)-1;
4002 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004003 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004004 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005 return (Py_UCS4)-1;
4006 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004007 data = PyUnicode_DATA(unicode);
4008 kind = PyUnicode_KIND(unicode);
4009 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010}
4011
4012int
4013PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4014{
4015 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004016 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004017 return -1;
4018 }
Victor Stinner488fa492011-12-12 00:01:39 +01004019 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004020 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004021 PyErr_SetString(PyExc_IndexError, "string index out of range");
4022 return -1;
4023 }
Victor Stinner488fa492011-12-12 00:01:39 +01004024 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004025 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004026 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4027 PyErr_SetString(PyExc_ValueError, "character out of range");
4028 return -1;
4029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4031 index, ch);
4032 return 0;
4033}
4034
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4040
Victor Stinner554f3f02010-06-16 23:33:54 +00004041/* create or adjust a UnicodeDecodeError */
4042static void
4043make_decode_exception(PyObject **exceptionObject,
4044 const char *encoding,
4045 const char *input, Py_ssize_t length,
4046 Py_ssize_t startpos, Py_ssize_t endpos,
4047 const char *reason)
4048{
4049 if (*exceptionObject == NULL) {
4050 *exceptionObject = PyUnicodeDecodeError_Create(
4051 encoding, input, length, startpos, endpos, reason);
4052 }
4053 else {
4054 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4055 goto onError;
4056 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4057 goto onError;
4058 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4059 goto onError;
4060 }
4061 return;
4062
4063onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004064 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004065}
4066
Steve Dowercc16be82016-09-08 10:35:16 -07004067#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004068static int
4069widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4070{
4071 if (newsize > *size) {
4072 wchar_t *newbuf = *buf;
4073 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4074 PyErr_NoMemory();
4075 return -1;
4076 }
4077 *buf = newbuf;
4078 }
4079 *size = newsize;
4080 return 0;
4081}
4082
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083/* error handling callback helper:
4084 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004085 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 and adjust various state variables.
4087 return 0 on success, -1 on error
4088*/
4089
Alexander Belopolsky40018472011-02-26 01:02:56 +00004090static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004091unicode_decode_call_errorhandler_wchar(
4092 const char *errors, PyObject **errorHandler,
4093 const char *encoding, const char *reason,
4094 const char **input, const char **inend, Py_ssize_t *startinpos,
4095 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004096 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004097{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004098 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004099
4100 PyObject *restuple = NULL;
4101 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004102 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004103 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004104 Py_ssize_t requiredsize;
4105 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004106 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004107 wchar_t *repwstr;
4108 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004109
4110 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 *errorHandler = PyCodec_LookupError(errors);
4112 if (*errorHandler == NULL)
4113 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114 }
4115
Victor Stinner554f3f02010-06-16 23:33:54 +00004116 make_decode_exception(exceptionObject,
4117 encoding,
4118 *input, *inend - *input,
4119 *startinpos, *endinpos,
4120 reason);
4121 if (*exceptionObject == NULL)
4122 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004124 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004126 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004128 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004131 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004133
4134 /* Copy back the bytes variables, which might have been modified by the
4135 callback */
4136 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4137 if (!inputobj)
4138 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004139 *input = PyBytes_AS_STRING(inputobj);
4140 insize = PyBytes_GET_SIZE(inputobj);
4141 *inend = *input + insize;
4142 /* we can DECREF safely, as the exception has another reference,
4143 so the object won't go away. */
4144 Py_DECREF(inputobj);
4145
4146 if (newpos<0)
4147 newpos = insize+newpos;
4148 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004149 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004150 goto onError;
4151 }
4152
4153 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4154 if (repwstr == NULL)
4155 goto onError;
4156 /* need more space? (at least enough for what we
4157 have+the replacement+the rest of the string (starting
4158 at the new input position), so we won't have to check space
4159 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004160 requiredsize = *outpos;
4161 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4162 goto overflow;
4163 requiredsize += repwlen;
4164 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4165 goto overflow;
4166 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004167 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004168 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004169 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004170 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004171 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004172 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004173 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004174 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004175 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004176 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004177 *endinpos = newpos;
4178 *inptr = *input + newpos;
4179
4180 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004181 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004182 return 0;
4183
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004184 overflow:
4185 PyErr_SetString(PyExc_OverflowError,
4186 "decoded result is too long for a Python string");
4187
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004188 onError:
4189 Py_XDECREF(restuple);
4190 return -1;
4191}
Steve Dowercc16be82016-09-08 10:35:16 -07004192#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004193
4194static int
4195unicode_decode_call_errorhandler_writer(
4196 const char *errors, PyObject **errorHandler,
4197 const char *encoding, const char *reason,
4198 const char **input, const char **inend, Py_ssize_t *startinpos,
4199 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4200 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4201{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004202 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004203
4204 PyObject *restuple = NULL;
4205 PyObject *repunicode = NULL;
4206 Py_ssize_t insize;
4207 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004208 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004209 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004210 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004211 int need_to_grow = 0;
4212 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004213
4214 if (*errorHandler == NULL) {
4215 *errorHandler = PyCodec_LookupError(errors);
4216 if (*errorHandler == NULL)
4217 goto onError;
4218 }
4219
4220 make_decode_exception(exceptionObject,
4221 encoding,
4222 *input, *inend - *input,
4223 *startinpos, *endinpos,
4224 reason);
4225 if (*exceptionObject == NULL)
4226 goto onError;
4227
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004228 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004229 if (restuple == NULL)
4230 goto onError;
4231 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004232 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004233 goto onError;
4234 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004235 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004236 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004237
4238 /* Copy back the bytes variables, which might have been modified by the
4239 callback */
4240 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4241 if (!inputobj)
4242 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004243 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004244 *input = PyBytes_AS_STRING(inputobj);
4245 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004246 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004247 /* we can DECREF safely, as the exception has another reference,
4248 so the object won't go away. */
4249 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004250
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004252 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004253 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004254 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004255 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004256 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004257
Victor Stinner170ca6f2013-04-18 00:25:28 +02004258 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004259 if (replen > 1) {
4260 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004261 need_to_grow = 1;
4262 }
4263 new_inptr = *input + newpos;
4264 if (*inend - new_inptr > remain) {
4265 /* We don't know the decoding algorithm here so we make the worst
4266 assumption that one byte decodes to one unicode character.
4267 If unfortunately one byte could decode to more unicode characters,
4268 the decoder may write out-of-bound then. Is it possible for the
4269 algorithms using this function? */
4270 writer->min_length += *inend - new_inptr - remain;
4271 need_to_grow = 1;
4272 }
4273 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004274 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004275 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004276 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4277 goto onError;
4278 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004279 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004280 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004281
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004283 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004284
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004286 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004287 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004288
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004291 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004292}
4293
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004294/* --- UTF-7 Codec -------------------------------------------------------- */
4295
Antoine Pitrou244651a2009-05-04 18:56:13 +00004296/* See RFC2152 for details. We encode conservatively and decode liberally. */
4297
/* Base-64 helpers for the UTF-7 codec.  Each macro may evaluate its
   argument more than once, so pass only side-effect-free expressions. */

/* True when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/'). */
#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')

/* Six-bit value of a base-64 character.  The result is only meaningful
   when IS_BASE64(c) holds; any other input falls through to 63. */
#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low six bits of n (higher bits are ignored). */
#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true when a byte met outside a shift sequence decodes
 * as itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */
#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
4328
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code, gives
 * the category of each character:
 *   0 : "Set D"
 *       alphanumeric and '(),-./:?
 *   1 : "Set O"
 *       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only, so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4362
/* ENCODE_DIRECT: true when character c is written to the output as
 * itself rather than base-64 encoded.  Only codes 1..127 can qualify.
 * Set D characters (category 0) always do; whitespace (category 2) does
 * when directWS is true, and Set O (category 1) does when directO is
 * true.  RFC2152 leaves both choices to the application, which is why
 * they are parameters here.  c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374
Alexander Belopolsky40018472011-02-26 01:02:56 +00004375PyObject *
4376PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004377 Py_ssize_t size,
4378 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004379{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004380 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4381}
4382
Antoine Pitrou244651a2009-05-04 18:56:13 +00004383/* The decoder. The only state we preserve is our read position,
4384 * i.e. how many characters we have consumed. So if we end in the
4385 * middle of a shift sequence we have to back off the read position
4386 * and the output to the beginning of the sequence, otherwise we lose
4387 * all the shift state (seen bits, number of bits seen, high
4388 * surrogate). */
4389
Alexander Belopolsky40018472011-02-26 01:02:56 +00004390PyObject *
4391PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004392 Py_ssize_t size,
4393 const char *errors,
4394 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004395{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004397 Py_ssize_t startinpos;
4398 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004399 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004400 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401 const char *errmsg = "";
4402 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004403 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004404 unsigned int base64bits = 0;
4405 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004406 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 PyObject *errorHandler = NULL;
4408 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004409
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004410 if (size == 0) {
4411 if (consumed)
4412 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004413 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004414 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004415
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004416 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004417 _PyUnicodeWriter_Init(&writer);
4418 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004419
4420 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004421 e = s + size;
4422
4423 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004424 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004425 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004426 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004427
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428 if (inShift) { /* in a base-64 section */
4429 if (IS_BASE64(ch)) { /* consume a base-64 character */
4430 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4431 base64bits += 6;
4432 s++;
4433 if (base64bits >= 16) {
4434 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004435 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004436 base64bits -= 16;
4437 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004438 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004439 if (surrogate) {
4440 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004441 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4442 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004443 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004444 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004445 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004446 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 }
4448 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004449 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004450 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 }
4453 }
Victor Stinner551ac952011-11-29 22:58:13 +01004454 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004455 /* first surrogate */
4456 surrogate = outCh;
4457 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004458 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004459 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004460 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461 }
4462 }
4463 }
4464 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004465 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004466 if (base64bits > 0) { /* left-over bits */
4467 if (base64bits >= 6) {
4468 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004469 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470 errmsg = "partial character in shift sequence";
4471 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473 else {
4474 /* Some bits remain; they should be zero */
4475 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004476 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004477 errmsg = "non-zero padding bits in shift sequence";
4478 goto utf7Error;
4479 }
4480 }
4481 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004482 if (surrogate && DECODE_DIRECT(ch)) {
4483 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4484 goto onError;
4485 }
4486 surrogate = 0;
4487 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004488 /* '-' is absorbed; other terminating
4489 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004490 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004491 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492 }
4493 }
4494 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004496 s++; /* consume '+' */
4497 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004499 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004500 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004501 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004502 else if (s < e && !IS_BASE64(*s)) {
4503 s++;
4504 errmsg = "ill-formed sequence";
4505 goto utf7Error;
4506 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004507 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004508 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004509 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004510 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004511 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004512 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004513 }
4514 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004515 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004517 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004518 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004519 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004520 else {
4521 startinpos = s-starts;
4522 s++;
4523 errmsg = "unexpected special character";
4524 goto utf7Error;
4525 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004526 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004527utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004529 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 errors, &errorHandler,
4531 "utf7", errmsg,
4532 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004533 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004534 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 }
4536
Antoine Pitrou244651a2009-05-04 18:56:13 +00004537 /* end of string */
4538
4539 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4540 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004541 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 if (surrogate ||
4543 (base64bits >= 6) ||
4544 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004546 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004547 errors, &errorHandler,
4548 "utf7", "unterminated shift sequence",
4549 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004550 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 goto onError;
4552 if (s < e)
4553 goto restart;
4554 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004556
4557 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004558 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004560 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004561 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004562 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004563 writer.kind, writer.data, shiftOutStart);
4564 Py_XDECREF(errorHandler);
4565 Py_XDECREF(exc);
4566 _PyUnicodeWriter_Dealloc(&writer);
4567 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004568 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004569 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 }
4571 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004572 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004574 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004575
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 Py_XDECREF(errorHandler);
4577 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004578 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579
Benjamin Peterson29060642009-01-31 22:14:21 +00004580 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581 Py_XDECREF(errorHandler);
4582 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004583 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584 return NULL;
4585}
4586
4587
Alexander Belopolsky40018472011-02-26 01:02:56 +00004588PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004589_PyUnicode_EncodeUTF7(PyObject *str,
4590 int base64SetO,
4591 int base64WhiteSpace,
4592 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004593{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004594 int kind;
4595 void *data;
4596 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004597 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004599 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 unsigned int base64bits = 0;
4601 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004602 char * out;
4603 char * start;
4604
Benjamin Petersonbac79492012-01-14 13:34:47 -05004605 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004606 return NULL;
4607 kind = PyUnicode_KIND(str);
4608 data = PyUnicode_DATA(str);
4609 len = PyUnicode_GET_LENGTH(str);
4610
4611 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004612 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004613
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004614 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004615 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004616 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004617 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004618 if (v == NULL)
4619 return NULL;
4620
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004621 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004622 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004623 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004624
Antoine Pitrou244651a2009-05-04 18:56:13 +00004625 if (inShift) {
4626 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4627 /* shifting out */
4628 if (base64bits) { /* output remaining bits */
4629 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4630 base64buffer = 0;
4631 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004632 }
4633 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 /* Characters not in the BASE64 set implicitly unshift the sequence
4635 so no '-' is required, except if the character is itself a '-' */
4636 if (IS_BASE64(ch) || ch == '-') {
4637 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004638 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004639 *out++ = (char) ch;
4640 }
4641 else {
4642 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004643 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004644 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004645 else { /* not in a shift sequence */
4646 if (ch == '+') {
4647 *out++ = '+';
4648 *out++ = '-';
4649 }
4650 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4651 *out++ = (char) ch;
4652 }
4653 else {
4654 *out++ = '+';
4655 inShift = 1;
4656 goto encode_char;
4657 }
4658 }
4659 continue;
4660encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004661 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004662 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004663
Antoine Pitrou244651a2009-05-04 18:56:13 +00004664 /* code first surrogate */
4665 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004666 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667 while (base64bits >= 6) {
4668 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4669 base64bits -= 6;
4670 }
4671 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004672 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004674 base64bits += 16;
4675 base64buffer = (base64buffer << 16) | ch;
4676 while (base64bits >= 6) {
4677 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4678 base64bits -= 6;
4679 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004680 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004681 if (base64bits)
4682 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4683 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004684 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004685 if (_PyBytes_Resize(&v, out - start) < 0)
4686 return NULL;
4687 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004688}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004689PyObject *
4690PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4691 Py_ssize_t size,
4692 int base64SetO,
4693 int base64WhiteSpace,
4694 const char *errors)
4695{
4696 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004697 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004698 if (tmp == NULL)
4699 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004700 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004701 base64WhiteSpace, errors);
4702 Py_DECREF(tmp);
4703 return result;
4704}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004705
Antoine Pitrou244651a2009-05-04 18:56:13 +00004706#undef IS_BASE64
4707#undef FROM_BASE64
4708#undef TO_BASE64
4709#undef DECODE_DIRECT
4710#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004711
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712/* --- UTF-8 Codec -------------------------------------------------------- */
4713
Alexander Belopolsky40018472011-02-26 01:02:56 +00004714PyObject *
4715PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004716 Py_ssize_t size,
4717 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718{
Walter Dörwald69652032004-09-07 20:24:22 +00004719 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4720}
4721
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004722#include "stringlib/asciilib.h"
4723#include "stringlib/codecs.h"
4724#include "stringlib/undef.h"
4725
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004726#include "stringlib/ucs1lib.h"
4727#include "stringlib/codecs.h"
4728#include "stringlib/undef.h"
4729
4730#include "stringlib/ucs2lib.h"
4731#include "stringlib/codecs.h"
4732#include "stringlib/undef.h"
4733
4734#include "stringlib/ucs4lib.h"
4735#include "stringlib/codecs.h"
4736#include "stringlib/undef.h"
4737
Antoine Pitrouab868312009-01-10 15:40:25 +00004738/* Mask to quickly check whether a C 'long' contains a
4739 non-ASCII, UTF8-encoded char. */
4740#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004741# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004742#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004743# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004744#else
4745# error C 'long' size should be either 4 or 8!
4746#endif
4747
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004748static Py_ssize_t
4749ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004750{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004751 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004752 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004753
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004754 /*
4755 * Issue #17237: m68k is a bit different from most architectures in
4756 * that objects do not use "natural alignment" - for example, int and
4757 * long are only aligned at 2-byte boundaries. Therefore the assert()
4758 * won't work; also, tests have shown that skipping the "optimised
4759 * version" will even speed up m68k.
4760 */
4761#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004762#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004763 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4764 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004765 /* Fast path, see in STRINGLIB(utf8_decode) for
4766 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004767 /* Help allocation */
4768 const char *_p = p;
4769 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 while (_p < aligned_end) {
4771 unsigned long value = *(const unsigned long *) _p;
4772 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004773 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004774 *((unsigned long *)q) = value;
4775 _p += SIZEOF_LONG;
4776 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004777 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004778 p = _p;
4779 while (p < end) {
4780 if ((unsigned char)*p & 0x80)
4781 break;
4782 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004784 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004786#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004787#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004788 while (p < end) {
4789 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4790 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004791 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004792 /* Help allocation */
4793 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794 while (_p < aligned_end) {
4795 unsigned long value = *(unsigned long *) _p;
4796 if (value & ASCII_CHAR_MASK)
4797 break;
4798 _p += SIZEOF_LONG;
4799 }
4800 p = _p;
4801 if (_p == end)
4802 break;
4803 }
4804 if ((unsigned char)*p & 0x80)
4805 break;
4806 ++p;
4807 }
4808 memcpy(dest, start, p - start);
4809 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810}
Antoine Pitrouab868312009-01-10 15:40:25 +00004811
Victor Stinner785938e2011-12-11 20:09:03 +01004812PyObject *
4813PyUnicode_DecodeUTF8Stateful(const char *s,
4814 Py_ssize_t size,
4815 const char *errors,
4816 Py_ssize_t *consumed)
4817{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004818 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004819 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004820 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004821
4822 Py_ssize_t startinpos;
4823 Py_ssize_t endinpos;
4824 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004825 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004826 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004827 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004828
4829 if (size == 0) {
4830 if (consumed)
4831 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004832 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004833 }
4834
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004835 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4836 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004837 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004838 *consumed = 1;
4839 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004840 }
4841
Victor Stinner8f674cc2013-04-17 23:02:17 +02004842 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004843 writer.min_length = size;
4844 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004845 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004846
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004847 writer.pos = ascii_decode(s, end, writer.data);
4848 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004849 while (s < end) {
4850 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004851 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004852
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004853 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004854 if (PyUnicode_IS_ASCII(writer.buffer))
4855 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004856 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004857 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004858 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004859 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004860 } else {
4861 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004862 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004863 }
4864
4865 switch (ch) {
4866 case 0:
4867 if (s == end || consumed)
4868 goto End;
4869 errmsg = "unexpected end of data";
4870 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004871 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004872 break;
4873 case 1:
4874 errmsg = "invalid start byte";
4875 startinpos = s - starts;
4876 endinpos = startinpos + 1;
4877 break;
4878 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004879 case 3:
4880 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004881 errmsg = "invalid continuation byte";
4882 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004883 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004884 break;
4885 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004886 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887 goto onError;
4888 continue;
4889 }
4890
Victor Stinner1d65d912015-10-05 13:43:50 +02004891 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004892 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004893
4894 switch (error_handler) {
4895 case _Py_ERROR_IGNORE:
4896 s += (endinpos - startinpos);
4897 break;
4898
4899 case _Py_ERROR_REPLACE:
4900 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4901 goto onError;
4902 s += (endinpos - startinpos);
4903 break;
4904
4905 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004906 {
4907 Py_ssize_t i;
4908
Victor Stinner1d65d912015-10-05 13:43:50 +02004909 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4910 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004911 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004912 ch = (Py_UCS4)(unsigned char)(starts[i]);
4913 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4914 ch + 0xdc00);
4915 writer.pos++;
4916 }
4917 s += (endinpos - startinpos);
4918 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004919 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004920
4921 default:
4922 if (unicode_decode_call_errorhandler_writer(
4923 errors, &error_handler_obj,
4924 "utf-8", errmsg,
4925 &starts, &end, &startinpos, &endinpos, &exc, &s,
4926 &writer))
4927 goto onError;
4928 }
Victor Stinner785938e2011-12-11 20:09:03 +01004929 }
4930
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004931End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004932 if (consumed)
4933 *consumed = s - starts;
4934
Victor Stinner1d65d912015-10-05 13:43:50 +02004935 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004936 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004937 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004938
4939onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004940 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004941 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004942 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004943 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004944}
4945
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004946
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004947/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4948 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004949
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004950 On success, write a pointer to a newly allocated wide character string into
4951 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4952 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004953
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004954 On memory allocation failure, return -1.
4955
4956 On decoding error (if surrogateescape is zero), return -2. If wlen is
4957 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4958 is not NULL, write the decoding error message into *reason. */
4959int
4960_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02004961 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004962{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004963 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004964 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004965 wchar_t *unicode;
4966 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004967
Victor Stinner3d4226a2018-08-29 22:21:32 +02004968 int surrogateescape = 0;
4969 int surrogatepass = 0;
4970 switch (errors)
4971 {
4972 case _Py_ERROR_STRICT:
4973 break;
4974 case _Py_ERROR_SURROGATEESCAPE:
4975 surrogateescape = 1;
4976 break;
4977 case _Py_ERROR_SURROGATEPASS:
4978 surrogatepass = 1;
4979 break;
4980 default:
4981 return -3;
4982 }
4983
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004984 /* Note: size will always be longer than the resulting Unicode
4985 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01004986 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004987 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004988 }
4989
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004990 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01004991 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004992 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004993 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004994
4995 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004996 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004997 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004998 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004999 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005000#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005002#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005004#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 if (ch > 0xFF) {
5006#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005007 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005008#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005009 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005010 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005011 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5012 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5013#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005014 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005015 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005016 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005017 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005018 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005019
5020 if (surrogateescape) {
5021 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5022 }
5023 else {
5024 /* Is it a valid three-byte code? */
5025 if (surrogatepass
5026 && (e - s) >= 3
5027 && (s[0] & 0xf0) == 0xe0
5028 && (s[1] & 0xc0) == 0x80
5029 && (s[2] & 0xc0) == 0x80)
5030 {
5031 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5032 s += 3;
5033 unicode[outpos++] = ch;
5034 }
5035 else {
5036 PyMem_RawFree(unicode );
5037 if (reason != NULL) {
5038 switch (ch) {
5039 case 0:
5040 *reason = "unexpected end of data";
5041 break;
5042 case 1:
5043 *reason = "invalid start byte";
5044 break;
5045 /* 2, 3, 4 */
5046 default:
5047 *reason = "invalid continuation byte";
5048 break;
5049 }
5050 }
5051 if (wlen != NULL) {
5052 *wlen = s - orig_s;
5053 }
5054 return -2;
5055 }
5056 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005057 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005058 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005059 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005060 if (wlen) {
5061 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005062 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005063 *wstr = unicode;
5064 return 0;
5065}
5066
5067wchar_t*
5068_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5069{
5070 wchar_t *wstr;
5071 int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5072 if (res != 0) {
5073 return NULL;
5074 }
5075 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005076}
5077
Antoine Pitrouab868312009-01-10 15:40:25 +00005078
Victor Stinnere47e6982017-12-21 15:45:16 +01005079/* UTF-8 encoder using the surrogateescape error handler .
5080
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005081 On success, return 0 and write the newly allocated character string (use
5082 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005083
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005084 On encoding failure, return -2 and write the position of the invalid
5085 surrogate character into *error_pos (if error_pos is set) and the decoding
5086 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005087
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005088 On memory allocation failure, return -1. */
5089int
5090_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005091 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005092{
5093 const Py_ssize_t max_char_size = 4;
5094 Py_ssize_t len = wcslen(text);
5095
5096 assert(len >= 0);
5097
Victor Stinner3d4226a2018-08-29 22:21:32 +02005098 int surrogateescape = 0;
5099 int surrogatepass = 0;
5100 switch (errors)
5101 {
5102 case _Py_ERROR_STRICT:
5103 break;
5104 case _Py_ERROR_SURROGATEESCAPE:
5105 surrogateescape = 1;
5106 break;
5107 case _Py_ERROR_SURROGATEPASS:
5108 surrogatepass = 1;
5109 break;
5110 default:
5111 return -3;
5112 }
5113
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005114 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5115 return -1;
5116 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005117 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005118 if (raw_malloc) {
5119 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005120 }
5121 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005122 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005123 }
5124 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005125 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005126 }
5127
5128 char *p = bytes;
5129 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005130 for (i = 0; i < len; ) {
5131 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005132 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005133 i++;
5134#if Py_UNICODE_SIZE == 2
5135 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5136 && i < len
5137 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5138 {
5139 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5140 i++;
5141 }
5142#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005143
5144 if (ch < 0x80) {
5145 /* Encode ASCII */
5146 *p++ = (char) ch;
5147
5148 }
5149 else if (ch < 0x0800) {
5150 /* Encode Latin-1 */
5151 *p++ = (char)(0xc0 | (ch >> 6));
5152 *p++ = (char)(0x80 | (ch & 0x3f));
5153 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005154 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005155 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005156 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005157 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005158 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005159 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005160 if (reason != NULL) {
5161 *reason = "encoding error";
5162 }
5163 if (raw_malloc) {
5164 PyMem_RawFree(bytes);
5165 }
5166 else {
5167 PyMem_Free(bytes);
5168 }
5169 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005170 }
5171 *p++ = (char)(ch & 0xff);
5172 }
5173 else if (ch < 0x10000) {
5174 *p++ = (char)(0xe0 | (ch >> 12));
5175 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5176 *p++ = (char)(0x80 | (ch & 0x3f));
5177 }
5178 else { /* ch >= 0x10000 */
5179 assert(ch <= MAX_UNICODE);
5180 /* Encode UCS4 Unicode ordinals */
5181 *p++ = (char)(0xf0 | (ch >> 18));
5182 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5183 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5184 *p++ = (char)(0x80 | (ch & 0x3f));
5185 }
5186 }
5187 *p++ = '\0';
5188
5189 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005190 char *bytes2;
5191 if (raw_malloc) {
5192 bytes2 = PyMem_RawRealloc(bytes, final_size);
5193 }
5194 else {
5195 bytes2 = PyMem_Realloc(bytes, final_size);
5196 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005197 if (bytes2 == NULL) {
5198 if (error_pos != NULL) {
5199 *error_pos = (size_t)-1;
5200 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005201 if (raw_malloc) {
5202 PyMem_RawFree(bytes);
5203 }
5204 else {
5205 PyMem_Free(bytes);
5206 }
5207 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005208 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005209 *str = bytes2;
5210 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005211}
5212
5213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005214/* Primary internal function which creates utf8 encoded bytes objects.
5215
5216 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005217 and allocate exactly as much space needed at the end. Else allocate the
5218 maximum possible needed (4 result bytes per Unicode character), and return
5219 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005220*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005221PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005222_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223{
Victor Stinner6099a032011-12-18 14:22:26 +01005224 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005225 void *data;
5226 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005227
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005228 if (!PyUnicode_Check(unicode)) {
5229 PyErr_BadArgument();
5230 return NULL;
5231 }
5232
5233 if (PyUnicode_READY(unicode) == -1)
5234 return NULL;
5235
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005236 if (PyUnicode_UTF8(unicode))
5237 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5238 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005239
5240 kind = PyUnicode_KIND(unicode);
5241 data = PyUnicode_DATA(unicode);
5242 size = PyUnicode_GET_LENGTH(unicode);
5243
Benjamin Petersonead6b532011-12-20 17:23:42 -06005244 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005245 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005246 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005247 case PyUnicode_1BYTE_KIND:
5248 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5249 assert(!PyUnicode_IS_ASCII(unicode));
5250 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5251 case PyUnicode_2BYTE_KIND:
5252 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5253 case PyUnicode_4BYTE_KIND:
5254 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256}
5257
Alexander Belopolsky40018472011-02-26 01:02:56 +00005258PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005259PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5260 Py_ssize_t size,
5261 const char *errors)
5262{
5263 PyObject *v, *unicode;
5264
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005265 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005266 if (unicode == NULL)
5267 return NULL;
5268 v = _PyUnicode_AsUTF8String(unicode, errors);
5269 Py_DECREF(unicode);
5270 return v;
5271}
5272
5273PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005274PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005276 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277}
5278
Walter Dörwald41980ca2007-08-16 21:55:45 +00005279/* --- UTF-32 Codec ------------------------------------------------------- */
5280
5281PyObject *
5282PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 Py_ssize_t size,
5284 const char *errors,
5285 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005286{
5287 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5288}
5289
/* Decode 'size' bytes of UTF-32 data at 's' into a new str object.

   byteorder - optional in/out.  Its initial value selects the byte order
               (negative: little-endian, positive: big-endian, 0: look for a
               BOM and otherwise use the platform's native order).  When it
               starts at 0 and at least 4 bytes are available, the order
               that was detected (or 0) is written back.
   consumed  - if non-NULL, trailing bytes that do not form a complete
               4-byte unit are left undecoded instead of being reported as
               "truncated data", and the number of bytes used is stored.

   Return NULL on failure. */
5290PyObject *
5291PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005292 Py_ssize_t size,
5293 const char *errors,
5294 int *byteorder,
5295 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005296{
5297 const char *starts = s;
5298 Py_ssize_t startinpos;
5299 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005300 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005301 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005302 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005303 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005304 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005305 PyObject *errorHandler = NULL;
5306 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005307
/* q is the read cursor, e the end of the input. */
Walter Dörwald41980ca2007-08-16 21:55:45 +00005308 q = (unsigned char *)s;
5309 e = q + size;
5310
5311 if (byteorder)
5312 bo = *byteorder;
5313
5314 /* Check for BOM marks (U+FEFF) in the input and adjust current
5315 byte order setting accordingly. In native mode, the leading BOM
5316 mark is skipped, in all other modes, it is copied to the output
5317 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005318 if (bo == 0 && size >= 4) {
/* The first four bytes are assembled as a little-endian value, so
   0x0000FEFF means a little-endian BOM and 0xFFFE0000 a big-endian one. */
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005319 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005320 if (bom == 0x0000FEFF) {
5321 bo = -1;
5322 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005324 else if (bom == 0xFFFE0000) {
5325 bo = 1;
5326 q += 4;
5327 }
5328 if (byteorder)
5329 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005330 }
5331
/* Nothing left after the optional BOM: every input byte counts as consumed
   and the result is the empty string. */
Victor Stinnere64322e2012-10-30 23:12:47 +01005332 if (q == e) {
5333 if (consumed)
5334 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005335 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005336 }
5337
/* bo == 0 at this point means no byte order was given or detected; it then
   resolves to the native order: big-endian when WORDS_BIGENDIAN is defined,
   little-endian otherwise. */
Victor Stinnere64322e2012-10-30 23:12:47 +01005338#ifdef WORDS_BIGENDIAN
5339 le = bo < 0;
5340#else
5341 le = bo <= 0;
5342#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005343 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005344
Victor Stinner8f674cc2013-04-17 23:02:17 +02005345 _PyUnicodeWriter_Init(&writer);
/* One output character per 4 input bytes, rounded up. */
Victor Stinner170ca6f2013-04-18 00:25:28 +02005346 writer.min_length = (e - q + 3) / 4;
5347 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005348 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005349
Victor Stinnere64322e2012-10-30 23:12:47 +01005350 while (1) {
5351 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005352 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005353
/* Fast path: while whole 4-byte units remain, store code points directly
   into the writer's buffer.  The loop stops early, without advancing q, on
   a value above maxch or (for non-1-byte buffers) on a surrogate; ch keeps
   the last value read and is classified below. */
Victor Stinnere64322e2012-10-30 23:12:47 +01005354 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005355 enum PyUnicode_Kind kind = writer.kind;
5356 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005357 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005358 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005359 if (le) {
5360 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005361 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005362 if (ch > maxch)
5363 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005364 if (kind != PyUnicode_1BYTE_KIND &&
5365 Py_UNICODE_IS_SURROGATE(ch))
5366 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005367 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005368 q += 4;
5369 } while (q <= last);
5370 }
5371 else {
5372 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005373 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005374 if (ch > maxch)
5375 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005376 if (kind != PyUnicode_1BYTE_KIND &&
5377 Py_UNICODE_IS_SURROGATE(ch))
5378 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005379 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005380 q += 4;
5381 } while (q <= last);
5382 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005383 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005384 }
5385
/* Classify why the fast path stopped (or never ran, in which case ch is
   still 0). */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005386 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005387 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005388 startinpos = ((const char *)q) - starts;
5389 endinpos = startinpos + 4;
5390 }
5391 else if (ch <= maxch) {
/* All complete units were stored: finish if the input is exhausted or the
   caller accepts partial input. */
Victor Stinnere64322e2012-10-30 23:12:47 +01005392 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005393 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005394 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005395 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005396 startinpos = ((const char *)q) - starts;
5397 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005398 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005399 else {
/* ch is above maxch.  A code point below 0x110000 is written through
   _PyUnicodeWriter_WriteCharInline() (presumably widening the buffer --
   confirm against the writer implementation) and decoding resumes. */
5400 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005401 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005402 goto onError;
5403 q += 4;
5404 continue;
5405 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005406 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005407 startinpos = ((const char *)q) - starts;
5408 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005409 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005410
5411 /* The remaining input chars are ignored if the callback
5412 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005413 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005415 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005416 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005417 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005419 }
5420
Walter Dörwald41980ca2007-08-16 21:55:45 +00005421 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005422 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005423
Walter Dörwald41980ca2007-08-16 21:55:45 +00005424 Py_XDECREF(errorHandler);
5425 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005426 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005427
/* Failure path: release the writer and any error-handler state. */
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005429 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005430 Py_XDECREF(errorHandler);
5431 Py_XDECREF(exc);
5432 return NULL;
5433}
5434
5435PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005436_PyUnicode_EncodeUTF32(PyObject *str,
5437 const char *errors,
5438 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005439{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005440 enum PyUnicode_Kind kind;
5441 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005442 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005443 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005444 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005445#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005446 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005447#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005448 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005449#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005450 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005451 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005452 PyObject *errorHandler = NULL;
5453 PyObject *exc = NULL;
5454 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005455
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005456 if (!PyUnicode_Check(str)) {
5457 PyErr_BadArgument();
5458 return NULL;
5459 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005460 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005461 return NULL;
5462 kind = PyUnicode_KIND(str);
5463 data = PyUnicode_DATA(str);
5464 len = PyUnicode_GET_LENGTH(str);
5465
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005466 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005467 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005468 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005469 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005470 if (v == NULL)
5471 return NULL;
5472
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005473 /* output buffer is 4-bytes aligned */
5474 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005475 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005476 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005477 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005478 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005479 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005480
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005481 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005482 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005483 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005484 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 else
5486 encoding = "utf-32";
5487
5488 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005489 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5490 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005491 }
5492
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005493 pos = 0;
5494 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005495 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005496
5497 if (kind == PyUnicode_2BYTE_KIND) {
5498 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5499 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005500 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005501 else {
5502 assert(kind == PyUnicode_4BYTE_KIND);
5503 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5504 &out, native_ordering);
5505 }
5506 if (pos == len)
5507 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005508
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005509 rep = unicode_encode_call_errorhandler(
5510 errors, &errorHandler,
5511 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005512 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005513 if (!rep)
5514 goto error;
5515
5516 if (PyBytes_Check(rep)) {
5517 repsize = PyBytes_GET_SIZE(rep);
5518 if (repsize & 3) {
5519 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005520 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005521 "surrogates not allowed");
5522 goto error;
5523 }
5524 moreunits = repsize / 4;
5525 }
5526 else {
5527 assert(PyUnicode_Check(rep));
5528 if (PyUnicode_READY(rep) < 0)
5529 goto error;
5530 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5531 if (!PyUnicode_IS_ASCII(rep)) {
5532 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005533 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005534 "surrogates not allowed");
5535 goto error;
5536 }
5537 }
5538
5539 /* four bytes are reserved for each surrogate */
5540 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005541 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005542 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005543 /* integer overflow */
5544 PyErr_NoMemory();
5545 goto error;
5546 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005547 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005548 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005549 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005550 }
5551
5552 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005553 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005554 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005555 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005556 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005557 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5558 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005559 }
5560
5561 Py_CLEAR(rep);
5562 }
5563
5564 /* Cut back to size actually needed. This is necessary for, for example,
5565 encoding of a string containing isolated surrogates and the 'ignore'
5566 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005567 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005568 if (nsize != PyBytes_GET_SIZE(v))
5569 _PyBytes_Resize(&v, nsize);
5570 Py_XDECREF(errorHandler);
5571 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005572 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005573 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005574 error:
5575 Py_XDECREF(rep);
5576 Py_XDECREF(errorHandler);
5577 Py_XDECREF(exc);
5578 Py_XDECREF(v);
5579 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005580}
5581
Alexander Belopolsky40018472011-02-26 01:02:56 +00005582PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005583PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5584 Py_ssize_t size,
5585 const char *errors,
5586 int byteorder)
5587{
5588 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005589 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005590 if (tmp == NULL)
5591 return NULL;
5592 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5593 Py_DECREF(tmp);
5594 return result;
5595}
5596
5597PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005598PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005599{
Victor Stinnerb960b342011-11-20 19:12:52 +01005600 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005601}
5602
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603/* --- UTF-16 Codec ------------------------------------------------------- */
5604
Tim Peters772747b2001-08-09 22:21:55 +00005605PyObject *
5606PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005607 Py_ssize_t size,
5608 const char *errors,
5609 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610{
Walter Dörwald69652032004-09-07 20:24:22 +00005611 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5612}
5613
5614PyObject *
5615PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005616 Py_ssize_t size,
5617 const char *errors,
5618 int *byteorder,
5619 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005620{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005621 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005622 Py_ssize_t startinpos;
5623 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005624 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005625 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005626 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005627 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005628 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005629 PyObject *errorHandler = NULL;
5630 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005631 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632
Tim Peters772747b2001-08-09 22:21:55 +00005633 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005634 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635
5636 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005637 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005639 /* Check for BOM marks (U+FEFF) in the input and adjust current
5640 byte order setting accordingly. In native mode, the leading BOM
5641 mark is skipped, in all other modes, it is copied to the output
5642 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005643 if (bo == 0 && size >= 2) {
5644 const Py_UCS4 bom = (q[1] << 8) | q[0];
5645 if (bom == 0xFEFF) {
5646 q += 2;
5647 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005648 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005649 else if (bom == 0xFFFE) {
5650 q += 2;
5651 bo = 1;
5652 }
5653 if (byteorder)
5654 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005655 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656
Antoine Pitrou63065d72012-05-15 23:48:04 +02005657 if (q == e) {
5658 if (consumed)
5659 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005660 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005661 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005662
Christian Heimes743e0cd2012-10-17 23:52:17 +02005663#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005664 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005665 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005666#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005667 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005668 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005669#endif
Tim Peters772747b2001-08-09 22:21:55 +00005670
Antoine Pitrou63065d72012-05-15 23:48:04 +02005671 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005672 character count normally. Error handler will take care of
5673 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005674 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005675 writer.min_length = (e - q + 1) / 2;
5676 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005677 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005678
Antoine Pitrou63065d72012-05-15 23:48:04 +02005679 while (1) {
5680 Py_UCS4 ch = 0;
5681 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005682 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005683 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005684 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005685 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005686 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005687 native_ordering);
5688 else
5689 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005690 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005691 native_ordering);
5692 } else if (kind == PyUnicode_2BYTE_KIND) {
5693 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005694 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005695 native_ordering);
5696 } else {
5697 assert(kind == PyUnicode_4BYTE_KIND);
5698 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005699 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005700 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005701 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005702 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005703
Antoine Pitrou63065d72012-05-15 23:48:04 +02005704 switch (ch)
5705 {
5706 case 0:
5707 /* remaining byte at the end? (size should be even) */
5708 if (q == e || consumed)
5709 goto End;
5710 errmsg = "truncated data";
5711 startinpos = ((const char *)q) - starts;
5712 endinpos = ((const char *)e) - starts;
5713 break;
5714 /* The remaining input chars are ignored if the callback
5715 chooses to skip the input */
5716 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005717 q -= 2;
5718 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005719 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005720 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005721 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005722 endinpos = ((const char *)e) - starts;
5723 break;
5724 case 2:
5725 errmsg = "illegal encoding";
5726 startinpos = ((const char *)q) - 2 - starts;
5727 endinpos = startinpos + 2;
5728 break;
5729 case 3:
5730 errmsg = "illegal UTF-16 surrogate";
5731 startinpos = ((const char *)q) - 4 - starts;
5732 endinpos = startinpos + 2;
5733 break;
5734 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005735 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005736 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 continue;
5738 }
5739
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005740 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005741 errors,
5742 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005743 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005744 &starts,
5745 (const char **)&e,
5746 &startinpos,
5747 &endinpos,
5748 &exc,
5749 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005750 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752 }
5753
Antoine Pitrou63065d72012-05-15 23:48:04 +02005754End:
Walter Dörwald69652032004-09-07 20:24:22 +00005755 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005756 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005757
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005758 Py_XDECREF(errorHandler);
5759 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005760 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005763 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005764 Py_XDECREF(errorHandler);
5765 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 return NULL;
5767}
5768
Tim Peters772747b2001-08-09 22:21:55 +00005769PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005770_PyUnicode_EncodeUTF16(PyObject *str,
5771 const char *errors,
5772 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005774 enum PyUnicode_Kind kind;
5775 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005776 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005777 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005778 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005779 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005780#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005781 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005782#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005783 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005784#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005785 const char *encoding;
5786 Py_ssize_t nsize, pos;
5787 PyObject *errorHandler = NULL;
5788 PyObject *exc = NULL;
5789 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005790
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005791 if (!PyUnicode_Check(str)) {
5792 PyErr_BadArgument();
5793 return NULL;
5794 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005795 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005796 return NULL;
5797 kind = PyUnicode_KIND(str);
5798 data = PyUnicode_DATA(str);
5799 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005800
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005801 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005802 if (kind == PyUnicode_4BYTE_KIND) {
5803 const Py_UCS4 *in = (const Py_UCS4 *)data;
5804 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005805 while (in < end) {
5806 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005807 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005808 }
5809 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005810 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005811 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005813 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005814 nsize = len + pairs + (byteorder == 0);
5815 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005816 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005818 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005820 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005821 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005822 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005823 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005824 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005825 }
5826 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005827 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005828 }
Tim Peters772747b2001-08-09 22:21:55 +00005829
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005830 if (kind == PyUnicode_1BYTE_KIND) {
5831 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5832 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005833 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005834
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005835 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005836 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005837 }
5838 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005839 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005840 }
5841 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005842 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005843 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005844
5845 pos = 0;
5846 while (pos < len) {
5847 Py_ssize_t repsize, moreunits;
5848
5849 if (kind == PyUnicode_2BYTE_KIND) {
5850 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5851 &out, native_ordering);
5852 }
5853 else {
5854 assert(kind == PyUnicode_4BYTE_KIND);
5855 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5856 &out, native_ordering);
5857 }
5858 if (pos == len)
5859 break;
5860
5861 rep = unicode_encode_call_errorhandler(
5862 errors, &errorHandler,
5863 encoding, "surrogates not allowed",
5864 str, &exc, pos, pos + 1, &pos);
5865 if (!rep)
5866 goto error;
5867
5868 if (PyBytes_Check(rep)) {
5869 repsize = PyBytes_GET_SIZE(rep);
5870 if (repsize & 1) {
5871 raise_encode_exception(&exc, encoding,
5872 str, pos - 1, pos,
5873 "surrogates not allowed");
5874 goto error;
5875 }
5876 moreunits = repsize / 2;
5877 }
5878 else {
5879 assert(PyUnicode_Check(rep));
5880 if (PyUnicode_READY(rep) < 0)
5881 goto error;
5882 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5883 if (!PyUnicode_IS_ASCII(rep)) {
5884 raise_encode_exception(&exc, encoding,
5885 str, pos - 1, pos,
5886 "surrogates not allowed");
5887 goto error;
5888 }
5889 }
5890
5891 /* two bytes are reserved for each surrogate */
5892 if (moreunits > 1) {
5893 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005894 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005895 /* integer overflow */
5896 PyErr_NoMemory();
5897 goto error;
5898 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005899 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005900 goto error;
5901 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5902 }
5903
5904 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005905 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005906 out += moreunits;
5907 } else /* rep is unicode */ {
5908 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5909 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5910 &out, native_ordering);
5911 }
5912
5913 Py_CLEAR(rep);
5914 }
5915
5916 /* Cut back to size actually needed. This is necessary for, for example,
5917 encoding of a string containing isolated surrogates and the 'ignore' handler
5918 is used. */
5919 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5920 if (nsize != PyBytes_GET_SIZE(v))
5921 _PyBytes_Resize(&v, nsize);
5922 Py_XDECREF(errorHandler);
5923 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005924 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005925 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005926 error:
5927 Py_XDECREF(rep);
5928 Py_XDECREF(errorHandler);
5929 Py_XDECREF(exc);
5930 Py_XDECREF(v);
5931 return NULL;
5932#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933}
5934
Alexander Belopolsky40018472011-02-26 01:02:56 +00005935PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005936PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5937 Py_ssize_t size,
5938 const char *errors,
5939 int byteorder)
5940{
5941 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005942 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005943 if (tmp == NULL)
5944 return NULL;
5945 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5946 Py_DECREF(tmp);
5947 return result;
5948}
5949
5950PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005951PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005953 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954}
5955
5956/* --- Unicode Escape Codec ----------------------------------------------- */
5957
Fredrik Lundh06d12682001-01-24 07:59:11 +00005958static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005959
Alexander Belopolsky40018472011-02-26 01:02:56 +00005960PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005961_PyUnicode_DecodeUnicodeEscape(const char *s,
5962 Py_ssize_t size,
5963 const char *errors,
5964 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005966 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005967 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005969 PyObject *errorHandler = NULL;
5970 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005971
Eric V. Smith42454af2016-10-31 09:22:08 -04005972 // so we can remember if we've seen an invalid escape char or not
5973 *first_invalid_escape = NULL;
5974
Victor Stinner62ec3312016-09-06 17:04:34 -07005975 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005976 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005977 }
5978 /* Escaped strings will always be longer than the resulting
5979 Unicode string, so we start with size here and then reduce the
5980 length after conversion to the true value.
5981 (but if the error callback returns a long replacement string
5982 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005983 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005984 writer.min_length = size;
5985 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5986 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005987 }
5988
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 end = s + size;
5990 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005991 unsigned char c = (unsigned char) *s++;
5992 Py_UCS4 ch;
5993 int count;
5994 Py_ssize_t startinpos;
5995 Py_ssize_t endinpos;
5996 const char *message;
5997
5998#define WRITE_ASCII_CHAR(ch) \
5999 do { \
6000 assert(ch <= 127); \
6001 assert(writer.pos < writer.size); \
6002 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6003 } while(0)
6004
6005#define WRITE_CHAR(ch) \
6006 do { \
6007 if (ch <= writer.maxchar) { \
6008 assert(writer.pos < writer.size); \
6009 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6010 } \
6011 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6012 goto onError; \
6013 } \
6014 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015
6016 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006017 if (c != '\\') {
6018 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 continue;
6020 }
6021
Victor Stinner62ec3312016-09-06 17:04:34 -07006022 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006024 if (s >= end) {
6025 message = "\\ at end of string";
6026 goto error;
6027 }
6028 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006029
Victor Stinner62ec3312016-09-06 17:04:34 -07006030 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006031 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006034 case '\n': continue;
6035 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6036 case '\'': WRITE_ASCII_CHAR('\''); continue;
6037 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6038 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006039 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006040 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6041 case 't': WRITE_ASCII_CHAR('\t'); continue;
6042 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6043 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006044 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006045 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006046 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006047 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 case '0': case '1': case '2': case '3':
6051 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006052 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006053 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006054 ch = (ch<<3) + *s++ - '0';
6055 if (s < end && '0' <= *s && *s <= '7') {
6056 ch = (ch<<3) + *s++ - '0';
6057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006059 WRITE_CHAR(ch);
6060 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061
Benjamin Peterson29060642009-01-31 22:14:21 +00006062 /* hex escapes */
6063 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006065 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006066 message = "truncated \\xXX escape";
6067 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006071 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006072 message = "truncated \\uXXXX escape";
6073 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006076 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006077 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006078 message = "truncated \\UXXXXXXXX escape";
6079 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006080 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006081 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006082 ch <<= 4;
6083 if (c >= '0' && c <= '9') {
6084 ch += c - '0';
6085 }
6086 else if (c >= 'a' && c <= 'f') {
6087 ch += c - ('a' - 10);
6088 }
6089 else if (c >= 'A' && c <= 'F') {
6090 ch += c - ('A' - 10);
6091 }
6092 else {
6093 break;
6094 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006095 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006096 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006097 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006098 }
6099
6100 /* when we get here, ch is a 32-bit unicode character */
6101 if (ch > MAX_UNICODE) {
6102 message = "illegal Unicode character";
6103 goto error;
6104 }
6105
6106 WRITE_CHAR(ch);
6107 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006108
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006110 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006111 if (ucnhash_CAPI == NULL) {
6112 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006113 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6114 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006115 if (ucnhash_CAPI == NULL) {
6116 PyErr_SetString(
6117 PyExc_UnicodeError,
6118 "\\N escapes not supported (can't load unicodedata module)"
6119 );
6120 goto onError;
6121 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006122 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006123
6124 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006125 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006126 const char *start = ++s;
6127 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006128 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006129 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006130 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006131 namelen = s - start;
6132 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006133 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006134 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006135 ch = 0xffffffff; /* in case 'getcode' messes up */
6136 if (namelen <= INT_MAX &&
6137 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6138 &ch, 0)) {
6139 assert(ch <= MAX_UNICODE);
6140 WRITE_CHAR(ch);
6141 continue;
6142 }
6143 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006144 }
6145 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006146 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006147
6148 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006149 if (*first_invalid_escape == NULL) {
6150 *first_invalid_escape = s-1; /* Back up one char, since we've
6151 already incremented s. */
6152 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006153 WRITE_ASCII_CHAR('\\');
6154 WRITE_CHAR(c);
6155 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006157
6158 error:
6159 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006160 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006161 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006162 errors, &errorHandler,
6163 "unicodeescape", message,
6164 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006165 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006166 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006167 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006168 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006169
6170#undef WRITE_ASCII_CHAR
6171#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006173
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006174 Py_XDECREF(errorHandler);
6175 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006176 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006177
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006179 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006180 Py_XDECREF(errorHandler);
6181 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 return NULL;
6183}
6184
Eric V. Smith42454af2016-10-31 09:22:08 -04006185PyObject *
6186PyUnicode_DecodeUnicodeEscape(const char *s,
6187 Py_ssize_t size,
6188 const char *errors)
6189{
6190 const char *first_invalid_escape;
6191 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6192 &first_invalid_escape);
6193 if (result == NULL)
6194 return NULL;
6195 if (first_invalid_escape != NULL) {
6196 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6197 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006198 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006199 Py_DECREF(result);
6200 return NULL;
6201 }
6202 }
6203 return result;
6204}
6205
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006206/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207
Alexander Belopolsky40018472011-02-26 01:02:56 +00006208PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006209PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006211 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006212 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006214 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006215 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006216 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217
Ezio Melottie7f90372012-10-05 03:33:31 +03006218 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006219 escape.
6220
Ezio Melottie7f90372012-10-05 03:33:31 +03006221 For UCS1 strings it's '\xxx', 4 bytes per source character.
6222 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6223 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006224 */
6225
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006226 if (!PyUnicode_Check(unicode)) {
6227 PyErr_BadArgument();
6228 return NULL;
6229 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006230 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006231 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006232 }
Victor Stinner358af132015-10-12 22:36:57 +02006233
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006234 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006235 if (len == 0) {
6236 return PyBytes_FromStringAndSize(NULL, 0);
6237 }
6238
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006239 kind = PyUnicode_KIND(unicode);
6240 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006241 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6242 bytes, and 1 byte characters 4. */
6243 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006244 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006245 return PyErr_NoMemory();
6246 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006247 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006248 if (repr == NULL) {
6249 return NULL;
6250 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006251
Victor Stinner62ec3312016-09-06 17:04:34 -07006252 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006253 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006254 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006255
Victor Stinner62ec3312016-09-06 17:04:34 -07006256 /* U+0000-U+00ff range */
6257 if (ch < 0x100) {
6258 if (ch >= ' ' && ch < 127) {
6259 if (ch != '\\') {
6260 /* Copy printable US ASCII as-is */
6261 *p++ = (char) ch;
6262 }
6263 /* Escape backslashes */
6264 else {
6265 *p++ = '\\';
6266 *p++ = '\\';
6267 }
6268 }
Victor Stinner358af132015-10-12 22:36:57 +02006269
Victor Stinner62ec3312016-09-06 17:04:34 -07006270 /* Map special whitespace to '\t', \n', '\r' */
6271 else if (ch == '\t') {
6272 *p++ = '\\';
6273 *p++ = 't';
6274 }
6275 else if (ch == '\n') {
6276 *p++ = '\\';
6277 *p++ = 'n';
6278 }
6279 else if (ch == '\r') {
6280 *p++ = '\\';
6281 *p++ = 'r';
6282 }
6283
6284 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6285 else {
6286 *p++ = '\\';
6287 *p++ = 'x';
6288 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6289 *p++ = Py_hexdigits[ch & 0x000F];
6290 }
Tim Petersced69f82003-09-16 20:30:58 +00006291 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006292 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006293 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 *p++ = '\\';
6295 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006296 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6297 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6298 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6299 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6302 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006303
Victor Stinner62ec3312016-09-06 17:04:34 -07006304 /* Make sure that the first two digits are zero */
6305 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006306 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006307 *p++ = 'U';
6308 *p++ = '0';
6309 *p++ = '0';
6310 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6311 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6312 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6313 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6314 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6315 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318
Victor Stinner62ec3312016-09-06 17:04:34 -07006319 assert(p - PyBytes_AS_STRING(repr) > 0);
6320 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6321 return NULL;
6322 }
6323 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324}
6325
Alexander Belopolsky40018472011-02-26 01:02:56 +00006326PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006327PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6328 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006330 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006331 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006332 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006334 }
6335
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006336 result = PyUnicode_AsUnicodeEscapeString(tmp);
6337 Py_DECREF(tmp);
6338 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339}
6340
6341/* --- Raw Unicode Escape Codec ------------------------------------------- */
6342
Alexander Belopolsky40018472011-02-26 01:02:56 +00006343PyObject *
6344PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006345 Py_ssize_t size,
6346 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006348 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006349 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006351 PyObject *errorHandler = NULL;
6352 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006353
Victor Stinner62ec3312016-09-06 17:04:34 -07006354 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006355 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006356 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006357
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358 /* Escaped strings will always be longer than the resulting
6359 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360 length after conversion to the true value. (But decoding error
6361 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006362 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006363 writer.min_length = size;
6364 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6365 goto onError;
6366 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006367
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 end = s + size;
6369 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006370 unsigned char c = (unsigned char) *s++;
6371 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006372 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006373 Py_ssize_t startinpos;
6374 Py_ssize_t endinpos;
6375 const char *message;
6376
6377#define WRITE_CHAR(ch) \
6378 do { \
6379 if (ch <= writer.maxchar) { \
6380 assert(writer.pos < writer.size); \
6381 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6382 } \
6383 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6384 goto onError; \
6385 } \
6386 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006389 if (c != '\\' || s >= end) {
6390 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006391 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006392 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006393
Victor Stinner62ec3312016-09-06 17:04:34 -07006394 c = (unsigned char) *s++;
6395 if (c == 'u') {
6396 count = 4;
6397 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006399 else if (c == 'U') {
6400 count = 8;
6401 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006402 }
6403 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006404 assert(writer.pos < writer.size);
6405 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6406 WRITE_CHAR(c);
6407 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006408 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006409 startinpos = s - starts - 2;
6410
6411 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6412 for (ch = 0; count && s < end; ++s, --count) {
6413 c = (unsigned char)*s;
6414 ch <<= 4;
6415 if (c >= '0' && c <= '9') {
6416 ch += c - '0';
6417 }
6418 else if (c >= 'a' && c <= 'f') {
6419 ch += c - ('a' - 10);
6420 }
6421 else if (c >= 'A' && c <= 'F') {
6422 ch += c - ('A' - 10);
6423 }
6424 else {
6425 break;
6426 }
6427 }
6428 if (!count) {
6429 if (ch <= MAX_UNICODE) {
6430 WRITE_CHAR(ch);
6431 continue;
6432 }
6433 message = "\\Uxxxxxxxx out of range";
6434 }
6435
6436 endinpos = s-starts;
6437 writer.min_length = end - s + writer.pos;
6438 if (unicode_decode_call_errorhandler_writer(
6439 errors, &errorHandler,
6440 "rawunicodeescape", message,
6441 &starts, &end, &startinpos, &endinpos, &exc, &s,
6442 &writer)) {
6443 goto onError;
6444 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006445 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006446
6447#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006449 Py_XDECREF(errorHandler);
6450 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006451 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006452
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006454 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455 Py_XDECREF(errorHandler);
6456 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006458
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459}
6460
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006461
Alexander Belopolsky40018472011-02-26 01:02:56 +00006462PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006463PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464{
Victor Stinner62ec3312016-09-06 17:04:34 -07006465 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006467 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006468 int kind;
6469 void *data;
6470 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006472 if (!PyUnicode_Check(unicode)) {
6473 PyErr_BadArgument();
6474 return NULL;
6475 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006476 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006477 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006478 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006479 kind = PyUnicode_KIND(unicode);
6480 data = PyUnicode_DATA(unicode);
6481 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006482 if (kind == PyUnicode_1BYTE_KIND) {
6483 return PyBytes_FromStringAndSize(data, len);
6484 }
Victor Stinner0e368262011-11-10 20:12:49 +01006485
Victor Stinner62ec3312016-09-06 17:04:34 -07006486 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6487 bytes, and 1 byte characters 4. */
6488 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006489
Victor Stinner62ec3312016-09-06 17:04:34 -07006490 if (len > PY_SSIZE_T_MAX / expandsize) {
6491 return PyErr_NoMemory();
6492 }
6493 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6494 if (repr == NULL) {
6495 return NULL;
6496 }
6497 if (len == 0) {
6498 return repr;
6499 }
6500
6501 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006502 for (pos = 0; pos < len; pos++) {
6503 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006504
Victor Stinner62ec3312016-09-06 17:04:34 -07006505 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6506 if (ch < 0x100) {
6507 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006508 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006509 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006510 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 *p++ = '\\';
6512 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006513 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6514 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6515 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6516 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006518 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6519 else {
6520 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6521 *p++ = '\\';
6522 *p++ = 'U';
6523 *p++ = '0';
6524 *p++ = '0';
6525 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6526 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6527 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6528 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6529 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6530 *p++ = Py_hexdigits[ch & 15];
6531 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006533
Victor Stinner62ec3312016-09-06 17:04:34 -07006534 assert(p > PyBytes_AS_STRING(repr));
6535 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6536 return NULL;
6537 }
6538 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539}
6540
Alexander Belopolsky40018472011-02-26 01:02:56 +00006541PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006542PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6543 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006545 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006546 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006547 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006548 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006549 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6550 Py_DECREF(tmp);
6551 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552}
6553
6554/* --- Latin-1 Codec ------------------------------------------------------ */
6555
Alexander Belopolsky40018472011-02-26 01:02:56 +00006556PyObject *
6557PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006558 Py_ssize_t size,
6559 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006562 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563}
6564
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006565/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006566static void
6567make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006568 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006569 PyObject *unicode,
6570 Py_ssize_t startpos, Py_ssize_t endpos,
6571 const char *reason)
6572{
6573 if (*exceptionObject == NULL) {
6574 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006575 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006576 encoding, unicode, startpos, endpos, reason);
6577 }
6578 else {
6579 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6580 goto onError;
6581 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6582 goto onError;
6583 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6584 goto onError;
6585 return;
6586 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006587 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006588 }
6589}
6590
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006591/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006592static void
6593raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006594 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006595 PyObject *unicode,
6596 Py_ssize_t startpos, Py_ssize_t endpos,
6597 const char *reason)
6598{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006599 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006600 encoding, unicode, startpos, endpos, reason);
6601 if (*exceptionObject != NULL)
6602 PyCodec_StrictErrors(*exceptionObject);
6603}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006604
6605/* error handling callback helper:
6606 build arguments, call the callback and check the arguments,
6607 put the result into newpos and return the replacement string, which
6608 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006609static PyObject *
6610unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006611 PyObject **errorHandler,
6612 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006613 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006614 Py_ssize_t startpos, Py_ssize_t endpos,
6615 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006616{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006617 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006618 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006619 PyObject *restuple;
6620 PyObject *resunicode;
6621
6622 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006623 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006624 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006626 }
6627
Benjamin Petersonbac79492012-01-14 13:34:47 -05006628 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006629 return NULL;
6630 len = PyUnicode_GET_LENGTH(unicode);
6631
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006632 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006633 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006634 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006635 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006637 restuple = PyObject_CallFunctionObjArgs(
6638 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006639 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006640 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006642 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 Py_DECREF(restuple);
6644 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006645 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006646 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 &resunicode, newpos)) {
6648 Py_DECREF(restuple);
6649 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006650 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006651 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6652 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6653 Py_DECREF(restuple);
6654 return NULL;
6655 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006656 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006657 *newpos = len + *newpos;
6658 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006659 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 Py_DECREF(restuple);
6661 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006662 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006663 Py_INCREF(resunicode);
6664 Py_DECREF(restuple);
6665 return resunicode;
6666}
6667
Alexander Belopolsky40018472011-02-26 01:02:56 +00006668static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006669unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006670 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006671 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006672{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006673 /* input state */
6674 Py_ssize_t pos=0, size;
6675 int kind;
6676 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006677 /* pointer into the output */
6678 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006679 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6680 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006681 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006682 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006683 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006684 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006685 /* output object */
6686 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687
Benjamin Petersonbac79492012-01-14 13:34:47 -05006688 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006689 return NULL;
6690 size = PyUnicode_GET_LENGTH(unicode);
6691 kind = PyUnicode_KIND(unicode);
6692 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006693 /* allocate enough for a simple encoding without
6694 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006695 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006696 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006697
6698 _PyBytesWriter_Init(&writer);
6699 str = _PyBytesWriter_Alloc(&writer, size);
6700 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006701 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006702
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006703 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006704 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006705
Benjamin Peterson29060642009-01-31 22:14:21 +00006706 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006707 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006709 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006710 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006711 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006712 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006713 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006714 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006715 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006716 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006717 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006718
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006719 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006720 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006721
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006722 /* Only overallocate the buffer if it's not the last write */
6723 writer.overallocate = (collend < size);
6724
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006726 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006727 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006728
6729 switch (error_handler) {
6730 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006731 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006732 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006733
6734 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006735 memset(str, '?', collend - collstart);
6736 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006737 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006738 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006739 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 break;
Victor Stinner50149202015-09-22 00:26:54 +02006741
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006742 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006743 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006744 writer.min_size -= (collend - collstart);
6745 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006746 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006747 if (str == NULL)
6748 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006749 pos = collend;
6750 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006751
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006752 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006753 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006754 writer.min_size -= (collend - collstart);
6755 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006756 unicode, collstart, collend);
6757 if (str == NULL)
6758 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 break;
Victor Stinner50149202015-09-22 00:26:54 +02006761
Victor Stinnerc3713e92015-09-29 12:32:13 +02006762 case _Py_ERROR_SURROGATEESCAPE:
6763 for (i = collstart; i < collend; ++i) {
6764 ch = PyUnicode_READ(kind, data, i);
6765 if (ch < 0xdc80 || 0xdcff < ch) {
6766 /* Not a UTF-8b surrogate */
6767 break;
6768 }
6769 *str++ = (char)(ch - 0xdc00);
6770 ++pos;
6771 }
6772 if (i >= collend)
6773 break;
6774 collstart = pos;
6775 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006776 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006777
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006779 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6780 encoding, reason, unicode, &exc,
6781 collstart, collend, &newpos);
6782 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006784
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006785 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006786 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006787
Victor Stinner6bd525b2015-10-09 13:10:05 +02006788 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006789 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006790 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006791 PyBytes_AS_STRING(rep),
6792 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006793 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006794 else {
6795 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006796
Victor Stinner6bd525b2015-10-09 13:10:05 +02006797 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006798 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006799
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006800 if (limit == 256 ?
6801 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6802 !PyUnicode_IS_ASCII(rep))
6803 {
6804 /* Not all characters are smaller than limit */
6805 raise_encode_exception(&exc, encoding, unicode,
6806 collstart, collend, reason);
6807 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006809 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6810 str = _PyBytesWriter_WriteBytes(&writer, str,
6811 PyUnicode_DATA(rep),
6812 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006813 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006814 if (str == NULL)
6815 goto onError;
6816
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006817 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006818 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006819 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006820
6821 /* If overallocation was disabled, ensure that it was the last
6822 write. Otherwise, we missed an optimization */
6823 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006824 }
6825 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006826
Victor Stinner50149202015-09-22 00:26:54 +02006827 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006828 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006829 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006830
6831 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006832 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006833 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006834 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006835 Py_XDECREF(exc);
6836 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006837}
6838
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006839/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006840PyObject *
6841PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006842 Py_ssize_t size,
6843 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006845 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006846 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006847 if (unicode == NULL)
6848 return NULL;
6849 result = unicode_encode_ucs1(unicode, errors, 256);
6850 Py_DECREF(unicode);
6851 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852}
6853
Alexander Belopolsky40018472011-02-26 01:02:56 +00006854PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006855_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856{
6857 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 PyErr_BadArgument();
6859 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006861 if (PyUnicode_READY(unicode) == -1)
6862 return NULL;
6863 /* Fast path: if it is a one-byte string, construct
6864 bytes object directly. */
6865 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6866 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6867 PyUnicode_GET_LENGTH(unicode));
6868 /* Non-Latin-1 characters present. Defer to above function to
6869 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006870 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006871}
6872
6873PyObject*
6874PyUnicode_AsLatin1String(PyObject *unicode)
6875{
6876 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877}
6878
6879/* --- 7-bit ASCII Codec -------------------------------------------------- */
6880
Alexander Belopolsky40018472011-02-26 01:02:56 +00006881PyObject *
6882PyUnicode_DecodeASCII(const char *s,
6883 Py_ssize_t size,
6884 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006886 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006887 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006888 int kind;
6889 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006890 Py_ssize_t startinpos;
6891 Py_ssize_t endinpos;
6892 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006893 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006894 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006895 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006896 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006897
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006899 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006900
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006902 if (size == 1 && (unsigned char)s[0] < 128)
6903 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006904
Victor Stinner8f674cc2013-04-17 23:02:17 +02006905 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006906 writer.min_length = size;
6907 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006908 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006909
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006910 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006911 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006912 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006913 writer.pos = outpos;
6914 if (writer.pos == size)
6915 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006916
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006917 s += writer.pos;
6918 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006919 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006920 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006921 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006922 PyUnicode_WRITE(kind, data, writer.pos, c);
6923 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006925 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006927
6928 /* byte outsize range 0x00..0x7f: call the error handler */
6929
6930 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006931 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006932
6933 switch (error_handler)
6934 {
6935 case _Py_ERROR_REPLACE:
6936 case _Py_ERROR_SURROGATEESCAPE:
6937 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006938 but we may switch to UCS2 at the first write */
6939 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6940 goto onError;
6941 kind = writer.kind;
6942 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006943
6944 if (error_handler == _Py_ERROR_REPLACE)
6945 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6946 else
6947 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6948 writer.pos++;
6949 ++s;
6950 break;
6951
6952 case _Py_ERROR_IGNORE:
6953 ++s;
6954 break;
6955
6956 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 startinpos = s-starts;
6958 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006959 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006960 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006961 "ascii", "ordinal not in range(128)",
6962 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006963 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006964 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006965 kind = writer.kind;
6966 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006969 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006970 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006971 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006972
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006974 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006975 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006976 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 return NULL;
6978}
6979
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006980/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006981PyObject *
6982PyUnicode_EncodeASCII(const Py_UNICODE *p,
6983 Py_ssize_t size,
6984 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006986 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006987 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006988 if (unicode == NULL)
6989 return NULL;
6990 result = unicode_encode_ucs1(unicode, errors, 128);
6991 Py_DECREF(unicode);
6992 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993}
6994
Alexander Belopolsky40018472011-02-26 01:02:56 +00006995PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006996_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997{
6998 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006999 PyErr_BadArgument();
7000 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007002 if (PyUnicode_READY(unicode) == -1)
7003 return NULL;
7004 /* Fast path: if it is an ASCII-only string, construct bytes object
7005 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007006 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007007 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7008 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007009 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007010}
7011
7012PyObject *
7013PyUnicode_AsASCIIString(PyObject *unicode)
7014{
7015 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016}
7017
Steve Dowercc16be82016-09-08 10:35:16 -07007018#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007019
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007020/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007021
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007022#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007023#define NEED_RETRY
7024#endif
7025
Victor Stinner3a50e702011-10-18 21:21:00 +02007026#ifndef WC_ERR_INVALID_CHARS
7027# define WC_ERR_INVALID_CHARS 0x0080
7028#endif
7029
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007030static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007031code_page_name(UINT code_page, PyObject **obj)
7032{
7033 *obj = NULL;
7034 if (code_page == CP_ACP)
7035 return "mbcs";
7036 if (code_page == CP_UTF7)
7037 return "CP_UTF7";
7038 if (code_page == CP_UTF8)
7039 return "CP_UTF8";
7040
7041 *obj = PyBytes_FromFormat("cp%u", code_page);
7042 if (*obj == NULL)
7043 return NULL;
7044 return PyBytes_AS_STRING(*obj);
7045}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007046
Victor Stinner3a50e702011-10-18 21:21:00 +02007047static DWORD
7048decode_code_page_flags(UINT code_page)
7049{
7050 if (code_page == CP_UTF7) {
7051 /* The CP_UTF7 decoder only supports flags=0 */
7052 return 0;
7053 }
7054 else
7055 return MB_ERR_INVALID_CHARS;
7056}
7057
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007058/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007059 * Decode a byte string from a Windows code page into unicode object in strict
7060 * mode.
7061 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007062 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7063 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007064 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007065static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007066decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007067 wchar_t **buf,
7068 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007069 const char *in,
7070 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007071{
Victor Stinner3a50e702011-10-18 21:21:00 +02007072 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007073 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007074 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007075
7076 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007077 assert(insize > 0);
7078 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7079 if (outsize <= 0)
7080 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007081
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007082 /* Extend a wchar_t* buffer */
7083 Py_ssize_t n = *bufsize; /* Get the current length */
7084 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7085 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007086 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007087 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088
7089 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007090 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7091 if (outsize <= 0)
7092 goto error;
7093 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007094
Victor Stinner3a50e702011-10-18 21:21:00 +02007095error:
7096 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7097 return -2;
7098 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007099 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100}
7101
Victor Stinner3a50e702011-10-18 21:21:00 +02007102/*
7103 * Decode a byte string from a code page into unicode object with an error
7104 * handler.
7105 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007106 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007107 * UnicodeDecodeError exception and returns -1 on error.
7108 */
7109static int
7110decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007111 wchar_t **buf,
7112 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007113 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007114 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007115{
7116 const char *startin = in;
7117 const char *endin = in + size;
7118 const DWORD flags = decode_code_page_flags(code_page);
7119 /* Ideally, we should get reason from FormatMessage. This is the Windows
7120 2000 English version of the message. */
7121 const char *reason = "No mapping for the Unicode character exists "
7122 "in the target code page.";
7123 /* each step cannot decode more than 1 character, but a character can be
7124 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007125 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007126 int insize;
7127 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007128 PyObject *errorHandler = NULL;
7129 PyObject *exc = NULL;
7130 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007131 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007132 DWORD err;
7133 int ret = -1;
7134
7135 assert(size > 0);
7136
7137 encoding = code_page_name(code_page, &encoding_obj);
7138 if (encoding == NULL)
7139 return -1;
7140
Victor Stinner7d00cc12014-03-17 23:08:06 +01007141 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007142 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7143 UnicodeDecodeError. */
7144 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7145 if (exc != NULL) {
7146 PyCodec_StrictErrors(exc);
7147 Py_CLEAR(exc);
7148 }
7149 goto error;
7150 }
7151
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007152 /* Extend a wchar_t* buffer */
7153 Py_ssize_t n = *bufsize; /* Get the current length */
7154 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7155 PyErr_NoMemory();
7156 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007157 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007158 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7159 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007160 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007161 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007162
7163 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 while (in < endin)
7165 {
7166 /* Decode a character */
7167 insize = 1;
7168 do
7169 {
7170 outsize = MultiByteToWideChar(code_page, flags,
7171 in, insize,
7172 buffer, Py_ARRAY_LENGTH(buffer));
7173 if (outsize > 0)
7174 break;
7175 err = GetLastError();
7176 if (err != ERROR_NO_UNICODE_TRANSLATION
7177 && err != ERROR_INSUFFICIENT_BUFFER)
7178 {
7179 PyErr_SetFromWindowsErr(0);
7180 goto error;
7181 }
7182 insize++;
7183 }
7184 /* 4=maximum length of a UTF-8 sequence */
7185 while (insize <= 4 && (in + insize) <= endin);
7186
7187 if (outsize <= 0) {
7188 Py_ssize_t startinpos, endinpos, outpos;
7189
Victor Stinner7d00cc12014-03-17 23:08:06 +01007190 /* last character in partial decode? */
7191 if (in + insize >= endin && !final)
7192 break;
7193
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 startinpos = in - startin;
7195 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007196 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007197 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 errors, &errorHandler,
7199 encoding, reason,
7200 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007201 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 {
7203 goto error;
7204 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007205 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007206 }
7207 else {
7208 in += insize;
7209 memcpy(out, buffer, outsize * sizeof(wchar_t));
7210 out += outsize;
7211 }
7212 }
7213
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007214 /* Shrink the buffer */
7215 assert(out - *buf <= *bufsize);
7216 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007217 /* (in - startin) <= size and size is an int */
7218 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007219
7220error:
7221 Py_XDECREF(encoding_obj);
7222 Py_XDECREF(errorHandler);
7223 Py_XDECREF(exc);
7224 return ret;
7225}
7226
Victor Stinner3a50e702011-10-18 21:21:00 +02007227static PyObject *
7228decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007229 const char *s, Py_ssize_t size,
7230 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007231{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007232 wchar_t *buf = NULL;
7233 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007234 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007235
Victor Stinner3a50e702011-10-18 21:21:00 +02007236 if (code_page < 0) {
7237 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7238 return NULL;
7239 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007240 if (size < 0) {
7241 PyErr_BadInternalCall();
7242 return NULL;
7243 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007244
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007245 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007247
Victor Stinner76a31a62011-11-04 00:05:13 +01007248 do
7249 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007250#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007251 if (size > INT_MAX) {
7252 chunk_size = INT_MAX;
7253 final = 0;
7254 done = 0;
7255 }
7256 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007257#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007258 {
7259 chunk_size = (int)size;
7260 final = (consumed == NULL);
7261 done = 1;
7262 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007263
Victor Stinner76a31a62011-11-04 00:05:13 +01007264 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007265 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007266 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007267 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007268 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007269
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007270 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007271 s, chunk_size);
7272 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007273 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007274 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007275 errors, final);
7276 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007277
7278 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007279 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007280 return NULL;
7281 }
7282
7283 if (consumed)
7284 *consumed += converted;
7285
7286 s += converted;
7287 size -= converted;
7288 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007289
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007290 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7291 PyMem_Free(buf);
7292 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007293}
7294
Alexander Belopolsky40018472011-02-26 01:02:56 +00007295PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007296PyUnicode_DecodeCodePageStateful(int code_page,
7297 const char *s,
7298 Py_ssize_t size,
7299 const char *errors,
7300 Py_ssize_t *consumed)
7301{
7302 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7303}
7304
7305PyObject *
7306PyUnicode_DecodeMBCSStateful(const char *s,
7307 Py_ssize_t size,
7308 const char *errors,
7309 Py_ssize_t *consumed)
7310{
7311 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7312}
7313
7314PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007315PyUnicode_DecodeMBCS(const char *s,
7316 Py_ssize_t size,
7317 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007318{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007319 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7320}
7321
Victor Stinner3a50e702011-10-18 21:21:00 +02007322static DWORD
7323encode_code_page_flags(UINT code_page, const char *errors)
7324{
7325 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007326 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007327 }
7328 else if (code_page == CP_UTF7) {
7329 /* CP_UTF7 only supports flags=0 */
7330 return 0;
7331 }
7332 else {
7333 if (errors != NULL && strcmp(errors, "replace") == 0)
7334 return 0;
7335 else
7336 return WC_NO_BEST_FIT_CHARS;
7337 }
7338}
7339
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007340/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007341 * Encode a Unicode string to a Windows code page into a byte string in strict
7342 * mode.
7343 *
7344 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007345 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007346 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007347static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007348encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007349 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007350 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007351{
Victor Stinner554f3f02010-06-16 23:33:54 +00007352 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007353 BOOL *pusedDefaultChar = &usedDefaultChar;
7354 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007355 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007356 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 const DWORD flags = encode_code_page_flags(code_page, NULL);
7358 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007359 /* Create a substring so that we can get the UTF-16 representation
7360 of just the slice under consideration. */
7361 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007362
Martin v. Löwis3d325192011-11-04 18:23:06 +01007363 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007364
Victor Stinner3a50e702011-10-18 21:21:00 +02007365 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007366 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007367 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007368 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007369
Victor Stinner2fc507f2011-11-04 20:06:39 +01007370 substring = PyUnicode_Substring(unicode, offset, offset+len);
7371 if (substring == NULL)
7372 return -1;
7373 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7374 if (p == NULL) {
7375 Py_DECREF(substring);
7376 return -1;
7377 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007378 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007379
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007380 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007381 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007382 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007383 NULL, 0,
7384 NULL, pusedDefaultChar);
7385 if (outsize <= 0)
7386 goto error;
7387 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007388 if (pusedDefaultChar && *pusedDefaultChar) {
7389 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007390 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007391 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007392
Victor Stinner3a50e702011-10-18 21:21:00 +02007393 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007394 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007395 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007396 if (*outbytes == NULL) {
7397 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007399 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007400 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007401 }
7402 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007404 const Py_ssize_t n = PyBytes_Size(*outbytes);
7405 if (outsize > PY_SSIZE_T_MAX - n) {
7406 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007407 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007408 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007409 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007410 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7411 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007412 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007413 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007414 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007415 }
7416
7417 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007418 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007419 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007420 out, outsize,
7421 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007422 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007423 if (outsize <= 0)
7424 goto error;
7425 if (pusedDefaultChar && *pusedDefaultChar)
7426 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007427 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007428
Victor Stinner3a50e702011-10-18 21:21:00 +02007429error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007430 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007431 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7432 return -2;
7433 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007434 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007435}
7436
Victor Stinner3a50e702011-10-18 21:21:00 +02007437/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007438 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 * error handler.
7440 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007441 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007442 * -1 on other error.
7443 */
7444static int
7445encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007446 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007447 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007448{
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007450 Py_ssize_t pos = unicode_offset;
7451 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 /* Ideally, we should get reason from FormatMessage. This is the Windows
7453 2000 English version of the message. */
7454 const char *reason = "invalid character";
7455 /* 4=maximum length of a UTF-8 sequence */
7456 char buffer[4];
7457 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7458 Py_ssize_t outsize;
7459 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007460 PyObject *errorHandler = NULL;
7461 PyObject *exc = NULL;
7462 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007463 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007464 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007465 PyObject *rep;
7466 int ret = -1;
7467
7468 assert(insize > 0);
7469
7470 encoding = code_page_name(code_page, &encoding_obj);
7471 if (encoding == NULL)
7472 return -1;
7473
7474 if (errors == NULL || strcmp(errors, "strict") == 0) {
7475 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7476 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007477 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 if (exc != NULL) {
7479 PyCodec_StrictErrors(exc);
7480 Py_DECREF(exc);
7481 }
7482 Py_XDECREF(encoding_obj);
7483 return -1;
7484 }
7485
7486 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7487 pusedDefaultChar = &usedDefaultChar;
7488 else
7489 pusedDefaultChar = NULL;
7490
7491 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7492 PyErr_NoMemory();
7493 goto error;
7494 }
7495 outsize = insize * Py_ARRAY_LENGTH(buffer);
7496
7497 if (*outbytes == NULL) {
7498 /* Create string object */
7499 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7500 if (*outbytes == NULL)
7501 goto error;
7502 out = PyBytes_AS_STRING(*outbytes);
7503 }
7504 else {
7505 /* Extend string object */
7506 Py_ssize_t n = PyBytes_Size(*outbytes);
7507 if (n > PY_SSIZE_T_MAX - outsize) {
7508 PyErr_NoMemory();
7509 goto error;
7510 }
7511 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7512 goto error;
7513 out = PyBytes_AS_STRING(*outbytes) + n;
7514 }
7515
7516 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007517 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007518 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007519 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7520 wchar_t chars[2];
7521 int charsize;
7522 if (ch < 0x10000) {
7523 chars[0] = (wchar_t)ch;
7524 charsize = 1;
7525 }
7526 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007527 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7528 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007529 charsize = 2;
7530 }
7531
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007533 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007534 buffer, Py_ARRAY_LENGTH(buffer),
7535 NULL, pusedDefaultChar);
7536 if (outsize > 0) {
7537 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7538 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007539 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007540 memcpy(out, buffer, outsize);
7541 out += outsize;
7542 continue;
7543 }
7544 }
7545 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7546 PyErr_SetFromWindowsErr(0);
7547 goto error;
7548 }
7549
Victor Stinner3a50e702011-10-18 21:21:00 +02007550 rep = unicode_encode_call_errorhandler(
7551 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007552 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007553 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007554 if (rep == NULL)
7555 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007556 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007557
7558 if (PyBytes_Check(rep)) {
7559 outsize = PyBytes_GET_SIZE(rep);
7560 if (outsize != 1) {
7561 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7562 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7563 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7564 Py_DECREF(rep);
7565 goto error;
7566 }
7567 out = PyBytes_AS_STRING(*outbytes) + offset;
7568 }
7569 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7570 out += outsize;
7571 }
7572 else {
7573 Py_ssize_t i;
7574 enum PyUnicode_Kind kind;
7575 void *data;
7576
Benjamin Petersonbac79492012-01-14 13:34:47 -05007577 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007578 Py_DECREF(rep);
7579 goto error;
7580 }
7581
7582 outsize = PyUnicode_GET_LENGTH(rep);
7583 if (outsize != 1) {
7584 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7585 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7586 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7587 Py_DECREF(rep);
7588 goto error;
7589 }
7590 out = PyBytes_AS_STRING(*outbytes) + offset;
7591 }
7592 kind = PyUnicode_KIND(rep);
7593 data = PyUnicode_DATA(rep);
7594 for (i=0; i < outsize; i++) {
7595 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7596 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007597 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007598 encoding, unicode,
7599 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007600 "unable to encode error handler result to ASCII");
7601 Py_DECREF(rep);
7602 goto error;
7603 }
7604 *out = (unsigned char)ch;
7605 out++;
7606 }
7607 }
7608 Py_DECREF(rep);
7609 }
7610 /* write a NUL byte */
7611 *out = 0;
7612 outsize = out - PyBytes_AS_STRING(*outbytes);
7613 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7614 if (_PyBytes_Resize(outbytes, outsize) < 0)
7615 goto error;
7616 ret = 0;
7617
7618error:
7619 Py_XDECREF(encoding_obj);
7620 Py_XDECREF(errorHandler);
7621 Py_XDECREF(exc);
7622 return ret;
7623}
7624
Victor Stinner3a50e702011-10-18 21:21:00 +02007625static PyObject *
7626encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007627 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007628 const char *errors)
7629{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007630 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007631 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007632 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007633 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007634
Victor Stinner29dacf22015-01-26 16:41:32 +01007635 if (!PyUnicode_Check(unicode)) {
7636 PyErr_BadArgument();
7637 return NULL;
7638 }
7639
Benjamin Petersonbac79492012-01-14 13:34:47 -05007640 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007641 return NULL;
7642 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007643
Victor Stinner3a50e702011-10-18 21:21:00 +02007644 if (code_page < 0) {
7645 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7646 return NULL;
7647 }
7648
Martin v. Löwis3d325192011-11-04 18:23:06 +01007649 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007650 return PyBytes_FromStringAndSize(NULL, 0);
7651
Victor Stinner7581cef2011-11-03 22:32:33 +01007652 offset = 0;
7653 do
7654 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007655#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007656 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007657 chunks. */
7658 if (len > INT_MAX/2) {
7659 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007660 done = 0;
7661 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007662 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007663#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007664 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007665 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007666 done = 1;
7667 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007668
Victor Stinner76a31a62011-11-04 00:05:13 +01007669 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007670 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007671 errors);
7672 if (ret == -2)
7673 ret = encode_code_page_errors(code_page, &outbytes,
7674 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007675 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007676 if (ret < 0) {
7677 Py_XDECREF(outbytes);
7678 return NULL;
7679 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007680
Victor Stinner7581cef2011-11-03 22:32:33 +01007681 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007682 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007683 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007684
Victor Stinner3a50e702011-10-18 21:21:00 +02007685 return outbytes;
7686}
7687
7688PyObject *
7689PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7690 Py_ssize_t size,
7691 const char *errors)
7692{
Victor Stinner7581cef2011-11-03 22:32:33 +01007693 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007694 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007695 if (unicode == NULL)
7696 return NULL;
7697 res = encode_code_page(CP_ACP, unicode, errors);
7698 Py_DECREF(unicode);
7699 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007700}
7701
7702PyObject *
7703PyUnicode_EncodeCodePage(int code_page,
7704 PyObject *unicode,
7705 const char *errors)
7706{
Victor Stinner7581cef2011-11-03 22:32:33 +01007707 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007708}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007709
Alexander Belopolsky40018472011-02-26 01:02:56 +00007710PyObject *
7711PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007712{
Victor Stinner7581cef2011-11-03 22:32:33 +01007713 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007714}
7715
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007716#undef NEED_RETRY
7717
Steve Dowercc16be82016-09-08 10:35:16 -07007718#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007719
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720/* --- Character Mapping Codec -------------------------------------------- */
7721
Victor Stinnerfb161b12013-04-18 01:44:27 +02007722static int
7723charmap_decode_string(const char *s,
7724 Py_ssize_t size,
7725 PyObject *mapping,
7726 const char *errors,
7727 _PyUnicodeWriter *writer)
7728{
7729 const char *starts = s;
7730 const char *e;
7731 Py_ssize_t startinpos, endinpos;
7732 PyObject *errorHandler = NULL, *exc = NULL;
7733 Py_ssize_t maplen;
7734 enum PyUnicode_Kind mapkind;
7735 void *mapdata;
7736 Py_UCS4 x;
7737 unsigned char ch;
7738
7739 if (PyUnicode_READY(mapping) == -1)
7740 return -1;
7741
7742 maplen = PyUnicode_GET_LENGTH(mapping);
7743 mapdata = PyUnicode_DATA(mapping);
7744 mapkind = PyUnicode_KIND(mapping);
7745
7746 e = s + size;
7747
7748 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7749 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7750 * is disabled in encoding aliases, latin1 is preferred because
7751 * its implementation is faster. */
7752 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7753 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7754 Py_UCS4 maxchar = writer->maxchar;
7755
7756 assert (writer->kind == PyUnicode_1BYTE_KIND);
7757 while (s < e) {
7758 ch = *s;
7759 x = mapdata_ucs1[ch];
7760 if (x > maxchar) {
7761 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7762 goto onError;
7763 maxchar = writer->maxchar;
7764 outdata = (Py_UCS1 *)writer->data;
7765 }
7766 outdata[writer->pos] = x;
7767 writer->pos++;
7768 ++s;
7769 }
7770 return 0;
7771 }
7772
7773 while (s < e) {
7774 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7775 enum PyUnicode_Kind outkind = writer->kind;
7776 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7777 if (outkind == PyUnicode_1BYTE_KIND) {
7778 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7779 Py_UCS4 maxchar = writer->maxchar;
7780 while (s < e) {
7781 ch = *s;
7782 x = mapdata_ucs2[ch];
7783 if (x > maxchar)
7784 goto Error;
7785 outdata[writer->pos] = x;
7786 writer->pos++;
7787 ++s;
7788 }
7789 break;
7790 }
7791 else if (outkind == PyUnicode_2BYTE_KIND) {
7792 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7793 while (s < e) {
7794 ch = *s;
7795 x = mapdata_ucs2[ch];
7796 if (x == 0xFFFE)
7797 goto Error;
7798 outdata[writer->pos] = x;
7799 writer->pos++;
7800 ++s;
7801 }
7802 break;
7803 }
7804 }
7805 ch = *s;
7806
7807 if (ch < maplen)
7808 x = PyUnicode_READ(mapkind, mapdata, ch);
7809 else
7810 x = 0xfffe; /* invalid value */
7811Error:
7812 if (x == 0xfffe)
7813 {
7814 /* undefined mapping */
7815 startinpos = s-starts;
7816 endinpos = startinpos+1;
7817 if (unicode_decode_call_errorhandler_writer(
7818 errors, &errorHandler,
7819 "charmap", "character maps to <undefined>",
7820 &starts, &e, &startinpos, &endinpos, &exc, &s,
7821 writer)) {
7822 goto onError;
7823 }
7824 continue;
7825 }
7826
7827 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7828 goto onError;
7829 ++s;
7830 }
7831 Py_XDECREF(errorHandler);
7832 Py_XDECREF(exc);
7833 return 0;
7834
7835onError:
7836 Py_XDECREF(errorHandler);
7837 Py_XDECREF(exc);
7838 return -1;
7839}
7840
7841static int
7842charmap_decode_mapping(const char *s,
7843 Py_ssize_t size,
7844 PyObject *mapping,
7845 const char *errors,
7846 _PyUnicodeWriter *writer)
7847{
7848 const char *starts = s;
7849 const char *e;
7850 Py_ssize_t startinpos, endinpos;
7851 PyObject *errorHandler = NULL, *exc = NULL;
7852 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007853 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007854
7855 e = s + size;
7856
7857 while (s < e) {
7858 ch = *s;
7859
7860 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7861 key = PyLong_FromLong((long)ch);
7862 if (key == NULL)
7863 goto onError;
7864
7865 item = PyObject_GetItem(mapping, key);
7866 Py_DECREF(key);
7867 if (item == NULL) {
7868 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7869 /* No mapping found means: mapping is undefined. */
7870 PyErr_Clear();
7871 goto Undefined;
7872 } else
7873 goto onError;
7874 }
7875
7876 /* Apply mapping */
7877 if (item == Py_None)
7878 goto Undefined;
7879 if (PyLong_Check(item)) {
7880 long value = PyLong_AS_LONG(item);
7881 if (value == 0xFFFE)
7882 goto Undefined;
7883 if (value < 0 || value > MAX_UNICODE) {
7884 PyErr_Format(PyExc_TypeError,
7885 "character mapping must be in range(0x%lx)",
7886 (unsigned long)MAX_UNICODE + 1);
7887 goto onError;
7888 }
7889
7890 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7891 goto onError;
7892 }
7893 else if (PyUnicode_Check(item)) {
7894 if (PyUnicode_READY(item) == -1)
7895 goto onError;
7896 if (PyUnicode_GET_LENGTH(item) == 1) {
7897 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7898 if (value == 0xFFFE)
7899 goto Undefined;
7900 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7901 goto onError;
7902 }
7903 else {
7904 writer->overallocate = 1;
7905 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7906 goto onError;
7907 }
7908 }
7909 else {
7910 /* wrong return value */
7911 PyErr_SetString(PyExc_TypeError,
7912 "character mapping must return integer, None or str");
7913 goto onError;
7914 }
7915 Py_CLEAR(item);
7916 ++s;
7917 continue;
7918
7919Undefined:
7920 /* undefined mapping */
7921 Py_CLEAR(item);
7922 startinpos = s-starts;
7923 endinpos = startinpos+1;
7924 if (unicode_decode_call_errorhandler_writer(
7925 errors, &errorHandler,
7926 "charmap", "character maps to <undefined>",
7927 &starts, &e, &startinpos, &endinpos, &exc, &s,
7928 writer)) {
7929 goto onError;
7930 }
7931 }
7932 Py_XDECREF(errorHandler);
7933 Py_XDECREF(exc);
7934 return 0;
7935
7936onError:
7937 Py_XDECREF(item);
7938 Py_XDECREF(errorHandler);
7939 Py_XDECREF(exc);
7940 return -1;
7941}
7942
Alexander Belopolsky40018472011-02-26 01:02:56 +00007943PyObject *
7944PyUnicode_DecodeCharmap(const char *s,
7945 Py_ssize_t size,
7946 PyObject *mapping,
7947 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007949 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007950
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951 /* Default to Latin-1 */
7952 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007956 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007957 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007958 writer.min_length = size;
7959 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007961
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007962 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007963 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7964 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007965 }
7966 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007967 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7968 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007970 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007971
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007973 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974 return NULL;
7975}
7976
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007977/* Charmap encoding: the lookup table */
7978
Alexander Belopolsky40018472011-02-26 01:02:56 +00007979struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 PyObject_HEAD
7981 unsigned char level1[32];
7982 int count2, count3;
7983 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007984};
7985
7986static PyObject*
7987encoding_map_size(PyObject *obj, PyObject* args)
7988{
7989 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007990 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007992}
7993
7994static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007995 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 PyDoc_STR("Return the size (in bytes) of this object") },
7997 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007998};
7999
8000static void
8001encoding_map_dealloc(PyObject* o)
8002{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008003 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008004}
8005
8006static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008007 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008008 "EncodingMap", /*tp_name*/
8009 sizeof(struct encoding_map), /*tp_basicsize*/
8010 0, /*tp_itemsize*/
8011 /* methods */
8012 encoding_map_dealloc, /*tp_dealloc*/
8013 0, /*tp_print*/
8014 0, /*tp_getattr*/
8015 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008016 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 0, /*tp_repr*/
8018 0, /*tp_as_number*/
8019 0, /*tp_as_sequence*/
8020 0, /*tp_as_mapping*/
8021 0, /*tp_hash*/
8022 0, /*tp_call*/
8023 0, /*tp_str*/
8024 0, /*tp_getattro*/
8025 0, /*tp_setattro*/
8026 0, /*tp_as_buffer*/
8027 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8028 0, /*tp_doc*/
8029 0, /*tp_traverse*/
8030 0, /*tp_clear*/
8031 0, /*tp_richcompare*/
8032 0, /*tp_weaklistoffset*/
8033 0, /*tp_iter*/
8034 0, /*tp_iternext*/
8035 encoding_map_methods, /*tp_methods*/
8036 0, /*tp_members*/
8037 0, /*tp_getset*/
8038 0, /*tp_base*/
8039 0, /*tp_dict*/
8040 0, /*tp_descr_get*/
8041 0, /*tp_descr_set*/
8042 0, /*tp_dictoffset*/
8043 0, /*tp_init*/
8044 0, /*tp_alloc*/
8045 0, /*tp_new*/
8046 0, /*tp_free*/
8047 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008048};
8049
8050PyObject*
8051PyUnicode_BuildEncodingMap(PyObject* string)
8052{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053 PyObject *result;
8054 struct encoding_map *mresult;
8055 int i;
8056 int need_dict = 0;
8057 unsigned char level1[32];
8058 unsigned char level2[512];
8059 unsigned char *mlevel1, *mlevel2, *mlevel3;
8060 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008061 int kind;
8062 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008063 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008064 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008066 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008067 PyErr_BadArgument();
8068 return NULL;
8069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008070 kind = PyUnicode_KIND(string);
8071 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008072 length = PyUnicode_GET_LENGTH(string);
8073 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008074 memset(level1, 0xFF, sizeof level1);
8075 memset(level2, 0xFF, sizeof level2);
8076
8077 /* If there isn't a one-to-one mapping of NULL to \0,
8078 or if there are non-BMP characters, we need to use
8079 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008080 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008081 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008082 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008083 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008084 ch = PyUnicode_READ(kind, data, i);
8085 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008086 need_dict = 1;
8087 break;
8088 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008089 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008090 /* unmapped character */
8091 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008092 l1 = ch >> 11;
8093 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008094 if (level1[l1] == 0xFF)
8095 level1[l1] = count2++;
8096 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008097 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008098 }
8099
8100 if (count2 >= 0xFF || count3 >= 0xFF)
8101 need_dict = 1;
8102
8103 if (need_dict) {
8104 PyObject *result = PyDict_New();
8105 PyObject *key, *value;
8106 if (!result)
8107 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008108 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008109 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008110 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008111 if (!key || !value)
8112 goto failed1;
8113 if (PyDict_SetItem(result, key, value) == -1)
8114 goto failed1;
8115 Py_DECREF(key);
8116 Py_DECREF(value);
8117 }
8118 return result;
8119 failed1:
8120 Py_XDECREF(key);
8121 Py_XDECREF(value);
8122 Py_DECREF(result);
8123 return NULL;
8124 }
8125
8126 /* Create a three-level trie */
8127 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8128 16*count2 + 128*count3 - 1);
8129 if (!result)
8130 return PyErr_NoMemory();
8131 PyObject_Init(result, &EncodingMapType);
8132 mresult = (struct encoding_map*)result;
8133 mresult->count2 = count2;
8134 mresult->count3 = count3;
8135 mlevel1 = mresult->level1;
8136 mlevel2 = mresult->level23;
8137 mlevel3 = mresult->level23 + 16*count2;
8138 memcpy(mlevel1, level1, 32);
8139 memset(mlevel2, 0xFF, 16*count2);
8140 memset(mlevel3, 0, 128*count3);
8141 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008142 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008143 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008144 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8145 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008146 /* unmapped character */
8147 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008148 o1 = ch>>11;
8149 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008150 i2 = 16*mlevel1[o1] + o2;
8151 if (mlevel2[i2] == 0xFF)
8152 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008153 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008154 i3 = 128*mlevel2[i2] + o3;
8155 mlevel3[i3] = i;
8156 }
8157 return result;
8158}
8159
8160static int
Victor Stinner22168992011-11-20 17:09:18 +01008161encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008162{
8163 struct encoding_map *map = (struct encoding_map*)mapping;
8164 int l1 = c>>11;
8165 int l2 = (c>>7) & 0xF;
8166 int l3 = c & 0x7F;
8167 int i;
8168
Victor Stinner22168992011-11-20 17:09:18 +01008169 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008170 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008171 if (c == 0)
8172 return 0;
8173 /* level 1*/
8174 i = map->level1[l1];
8175 if (i == 0xFF) {
8176 return -1;
8177 }
8178 /* level 2*/
8179 i = map->level23[16*i+l2];
8180 if (i == 0xFF) {
8181 return -1;
8182 }
8183 /* level 3 */
8184 i = map->level23[16*map->count2 + 128*i + l3];
8185 if (i == 0) {
8186 return -1;
8187 }
8188 return i;
8189}
8190
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008191/* Lookup the character ch in the mapping. If the character
8192 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008193 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008194static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008195charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196{
Christian Heimes217cfd12007-12-02 14:31:20 +00008197 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198 PyObject *x;
8199
8200 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008202 x = PyObject_GetItem(mapping, w);
8203 Py_DECREF(w);
8204 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008205 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8206 /* No mapping found means: mapping is undefined. */
8207 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008208 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 } else
8210 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008212 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008213 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008214 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 long value = PyLong_AS_LONG(x);
8216 if (value < 0 || value > 255) {
8217 PyErr_SetString(PyExc_TypeError,
8218 "character mapping must be in range(256)");
8219 Py_DECREF(x);
8220 return NULL;
8221 }
8222 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008224 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 /* wrong return value */
8228 PyErr_Format(PyExc_TypeError,
8229 "character mapping must return integer, bytes or None, not %.400s",
8230 x->ob_type->tp_name);
8231 Py_DECREF(x);
8232 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233 }
8234}
8235
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008236static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008237charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008238{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008239 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8240 /* exponentially overallocate to minimize reallocations */
8241 if (requiredsize < 2*outsize)
8242 requiredsize = 2*outsize;
8243 if (_PyBytes_Resize(outobj, requiredsize))
8244 return -1;
8245 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008246}
8247
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an exception has been set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008252 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008253 space is available. Return a new reference to the object that
8254 was put in the output buffer, or Py_None, if the mapping was undefined
8255 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008256 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008257static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008258charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008259 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008261 PyObject *rep;
8262 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008263 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264
Christian Heimes90aa7642007-12-19 02:45:37 +00008265 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008266 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008268 if (res == -1)
8269 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 if (outsize<requiredsize)
8271 if (charmapencode_resize(outobj, outpos, requiredsize))
8272 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008273 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 outstart[(*outpos)++] = (char)res;
8275 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008276 }
8277
8278 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008279 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008281 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 Py_DECREF(rep);
8283 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008284 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 if (PyLong_Check(rep)) {
8286 Py_ssize_t requiredsize = *outpos+1;
8287 if (outsize<requiredsize)
8288 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8289 Py_DECREF(rep);
8290 return enc_EXCEPTION;
8291 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008292 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008294 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 else {
8296 const char *repchars = PyBytes_AS_STRING(rep);
8297 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8298 Py_ssize_t requiredsize = *outpos+repsize;
8299 if (outsize<requiredsize)
8300 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8301 Py_DECREF(rep);
8302 return enc_EXCEPTION;
8303 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008304 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 memcpy(outstart + *outpos, repchars, repsize);
8306 *outpos += repsize;
8307 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008308 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008309 Py_DECREF(rep);
8310 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008311}
8312
8313/* handle an error in PyUnicode_EncodeCharmap
8314 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008315static int
8316charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008317 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008318 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008319 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008320 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008321{
8322 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008323 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008324 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008325 enum PyUnicode_Kind kind;
8326 void *data;
8327 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008328 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008329 Py_ssize_t collstartpos = *inpos;
8330 Py_ssize_t collendpos = *inpos+1;
8331 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008332 const char *encoding = "charmap";
8333 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008334 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008335 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008336 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008337
Benjamin Petersonbac79492012-01-14 13:34:47 -05008338 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008339 return -1;
8340 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008341 /* find all unencodable characters */
8342 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008343 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008344 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008345 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008346 val = encoding_map_lookup(ch, mapping);
8347 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 break;
8349 ++collendpos;
8350 continue;
8351 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008352
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008353 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8354 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 if (rep==NULL)
8356 return -1;
8357 else if (rep!=Py_None) {
8358 Py_DECREF(rep);
8359 break;
8360 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008361 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363 }
8364 /* cache callback name lookup
8365 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008366 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008367 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008368
8369 switch (*error_handler) {
8370 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008371 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008372 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008373
8374 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008375 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 x = charmapencode_output('?', mapping, res, respos);
8377 if (x==enc_EXCEPTION) {
8378 return -1;
8379 }
8380 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008381 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 return -1;
8383 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008384 }
8385 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008386 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008387 *inpos = collendpos;
8388 break;
Victor Stinner50149202015-09-22 00:26:54 +02008389
8390 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008391 /* generate replacement (temporarily (mis)uses p) */
8392 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 char buffer[2+29+1+1];
8394 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008395 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 for (cp = buffer; *cp; ++cp) {
8397 x = charmapencode_output(*cp, mapping, res, respos);
8398 if (x==enc_EXCEPTION)
8399 return -1;
8400 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008401 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 return -1;
8403 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008404 }
8405 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008406 *inpos = collendpos;
8407 break;
Victor Stinner50149202015-09-22 00:26:54 +02008408
Benjamin Peterson14339b62009-01-31 16:36:08 +00008409 default:
Victor Stinner50149202015-09-22 00:26:54 +02008410 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008411 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008413 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008415 if (PyBytes_Check(repunicode)) {
8416 /* Directly copy bytes result to output. */
8417 Py_ssize_t outsize = PyBytes_Size(*res);
8418 Py_ssize_t requiredsize;
8419 repsize = PyBytes_Size(repunicode);
8420 requiredsize = *respos + repsize;
8421 if (requiredsize > outsize)
8422 /* Make room for all additional bytes. */
8423 if (charmapencode_resize(res, respos, requiredsize)) {
8424 Py_DECREF(repunicode);
8425 return -1;
8426 }
8427 memcpy(PyBytes_AsString(*res) + *respos,
8428 PyBytes_AsString(repunicode), repsize);
8429 *respos += repsize;
8430 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008431 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008432 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008433 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008434 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008435 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008436 Py_DECREF(repunicode);
8437 return -1;
8438 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008439 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008440 data = PyUnicode_DATA(repunicode);
8441 kind = PyUnicode_KIND(repunicode);
8442 for (index = 0; index < repsize; index++) {
8443 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8444 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008446 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 return -1;
8448 }
8449 else if (x==enc_FAILED) {
8450 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008451 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 return -1;
8453 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008454 }
8455 *inpos = newpos;
8456 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008457 }
8458 return 0;
8459}
8460
Alexander Belopolsky40018472011-02-26 01:02:56 +00008461PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008462_PyUnicode_EncodeCharmap(PyObject *unicode,
8463 PyObject *mapping,
8464 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008466 /* output object */
8467 PyObject *res = NULL;
8468 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008469 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008470 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008471 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008472 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008473 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008474 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008475 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008476 void *data;
8477 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478
Benjamin Petersonbac79492012-01-14 13:34:47 -05008479 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008480 return NULL;
8481 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008482 data = PyUnicode_DATA(unicode);
8483 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008484
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485 /* Default to Latin-1 */
8486 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008487 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489 /* allocate enough for a simple encoding without
8490 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008491 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008492 if (res == NULL)
8493 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008494 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008497 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008498 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008500 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 if (x==enc_EXCEPTION) /* error */
8502 goto onError;
8503 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008504 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008506 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 &res, &respos)) {
8508 goto onError;
8509 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008510 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 else
8512 /* done with this character => adjust input position */
8513 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008516 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008517 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008518 if (_PyBytes_Resize(&res, respos) < 0)
8519 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008520
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008521 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008522 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523 return res;
8524
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008526 Py_XDECREF(res);
8527 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008528 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529 return NULL;
8530}
8531
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008532/* Deprecated */
8533PyObject *
8534PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8535 Py_ssize_t size,
8536 PyObject *mapping,
8537 const char *errors)
8538{
8539 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008540 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008541 if (unicode == NULL)
8542 return NULL;
8543 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8544 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008545 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008546}
8547
Alexander Belopolsky40018472011-02-26 01:02:56 +00008548PyObject *
8549PyUnicode_AsCharmapString(PyObject *unicode,
8550 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551{
8552 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 PyErr_BadArgument();
8554 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008556 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557}
8558
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008559/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008560static void
8561make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008562 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008563 Py_ssize_t startpos, Py_ssize_t endpos,
8564 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008566 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008567 *exceptionObject = _PyUnicodeTranslateError_Create(
8568 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569 }
8570 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8572 goto onError;
8573 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8574 goto onError;
8575 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8576 goto onError;
8577 return;
8578 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008579 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 }
8581}
8582
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008583/* error handling callback helper:
8584 build arguments, call the callback and check the arguments,
8585 put the result into newpos and return the replacement string, which
8586 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008587static PyObject *
8588unicode_translate_call_errorhandler(const char *errors,
8589 PyObject **errorHandler,
8590 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008591 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008592 Py_ssize_t startpos, Py_ssize_t endpos,
8593 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008594{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008595 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008596
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008597 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008598 PyObject *restuple;
8599 PyObject *resunicode;
8600
8601 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008603 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008605 }
8606
8607 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008609 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008611
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008612 restuple = PyObject_CallFunctionObjArgs(
8613 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008614 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008616 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008617 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 Py_DECREF(restuple);
8619 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008620 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008621 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 &resunicode, &i_newpos)) {
8623 Py_DECREF(restuple);
8624 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008626 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008628 else
8629 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008631 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 Py_DECREF(restuple);
8633 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008634 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008635 Py_INCREF(resunicode);
8636 Py_DECREF(restuple);
8637 return resunicode;
8638}
8639
8640/* Lookup the character ch in the mapping and put the result in result,
8641 which must be decrefed by the caller.
8642 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008643static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008645{
Christian Heimes217cfd12007-12-02 14:31:20 +00008646 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647 PyObject *x;
8648
8649 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008651 x = PyObject_GetItem(mapping, w);
8652 Py_DECREF(w);
8653 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8655 /* No mapping found means: use 1:1 mapping. */
8656 PyErr_Clear();
8657 *result = NULL;
8658 return 0;
8659 } else
8660 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661 }
8662 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 *result = x;
8664 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008666 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008668 if (value < 0 || value > MAX_UNICODE) {
8669 PyErr_Format(PyExc_ValueError,
8670 "character mapping must be in range(0x%x)",
8671 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 Py_DECREF(x);
8673 return -1;
8674 }
8675 *result = x;
8676 return 0;
8677 }
8678 else if (PyUnicode_Check(x)) {
8679 *result = x;
8680 return 0;
8681 }
8682 else {
8683 /* wrong return value */
8684 PyErr_SetString(PyExc_TypeError,
8685 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008686 Py_DECREF(x);
8687 return -1;
8688 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008689}
Victor Stinner1194ea02014-04-04 19:37:40 +02008690
8691/* lookup the character, write the result into the writer.
8692 Return 1 if the result was written into the writer, return 0 if the mapping
8693 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008694static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008695charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8696 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697{
Victor Stinner1194ea02014-04-04 19:37:40 +02008698 PyObject *item;
8699
8700 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008702
8703 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008705 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008706 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008707 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008708 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008709 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008710
8711 if (item == Py_None) {
8712 Py_DECREF(item);
8713 return 0;
8714 }
8715
8716 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008717 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8718 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8719 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008720 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8721 Py_DECREF(item);
8722 return -1;
8723 }
8724 Py_DECREF(item);
8725 return 1;
8726 }
8727
8728 if (!PyUnicode_Check(item)) {
8729 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008731 }
8732
8733 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8734 Py_DECREF(item);
8735 return -1;
8736 }
8737
8738 Py_DECREF(item);
8739 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008740}
8741
Victor Stinner89a76ab2014-04-05 11:44:04 +02008742static int
8743unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8744 Py_UCS1 *translate)
8745{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008746 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008747 int ret = 0;
8748
Victor Stinner89a76ab2014-04-05 11:44:04 +02008749 if (charmaptranslate_lookup(ch, mapping, &item)) {
8750 return -1;
8751 }
8752
8753 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008754 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008755 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008756 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008757 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008758 /* not found => default to 1:1 mapping */
8759 translate[ch] = ch;
8760 return 1;
8761 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008762 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008763 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008764 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8765 used it */
8766 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008767 /* invalid character or character outside ASCII:
8768 skip the fast translate */
8769 goto exit;
8770 }
8771 translate[ch] = (Py_UCS1)replace;
8772 }
8773 else if (PyUnicode_Check(item)) {
8774 Py_UCS4 replace;
8775
8776 if (PyUnicode_READY(item) == -1) {
8777 Py_DECREF(item);
8778 return -1;
8779 }
8780 if (PyUnicode_GET_LENGTH(item) != 1)
8781 goto exit;
8782
8783 replace = PyUnicode_READ_CHAR(item, 0);
8784 if (replace > 127)
8785 goto exit;
8786 translate[ch] = (Py_UCS1)replace;
8787 }
8788 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008789 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008790 goto exit;
8791 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008792 ret = 1;
8793
Benjamin Peterson1365de72014-04-07 20:15:41 -04008794 exit:
8795 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008796 return ret;
8797}
8798
8799/* Fast path for ascii => ascii translation. Return 1 if the whole string
8800 was translated into writer, return 0 if the input string was partially
8801 translated into writer, raise an exception and return -1 on error. */
8802static int
8803unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008804 _PyUnicodeWriter *writer, int ignore,
8805 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008806{
Victor Stinner872b2912014-04-05 14:27:07 +02008807 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008808 Py_ssize_t len;
8809 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008810 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008811
Victor Stinner89a76ab2014-04-05 11:44:04 +02008812 len = PyUnicode_GET_LENGTH(input);
8813
Victor Stinner872b2912014-04-05 14:27:07 +02008814 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008815
8816 in = PyUnicode_1BYTE_DATA(input);
8817 end = in + len;
8818
8819 assert(PyUnicode_IS_ASCII(writer->buffer));
8820 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8821 out = PyUnicode_1BYTE_DATA(writer->buffer);
8822
Victor Stinner872b2912014-04-05 14:27:07 +02008823 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008824 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008825 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008826 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008827 int translate = unicode_fast_translate_lookup(mapping, ch,
8828 ascii_table);
8829 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008830 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008831 if (translate == 0)
8832 goto exit;
8833 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008834 }
Victor Stinner872b2912014-04-05 14:27:07 +02008835 if (ch2 == 0xfe) {
8836 if (ignore)
8837 continue;
8838 goto exit;
8839 }
8840 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008841 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008842 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008843 }
Victor Stinner872b2912014-04-05 14:27:07 +02008844 res = 1;
8845
8846exit:
8847 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008848 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008849 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008850}
8851
Victor Stinner3222da22015-10-01 22:07:32 +02008852static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008853_PyUnicode_TranslateCharmap(PyObject *input,
8854 PyObject *mapping,
8855 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008857 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008858 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008859 Py_ssize_t size, i;
8860 int kind;
8861 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008862 _PyUnicodeWriter writer;
8863 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008864 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008865 PyObject *errorHandler = NULL;
8866 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008867 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008868 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008869
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008871 PyErr_BadArgument();
8872 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 if (PyUnicode_READY(input) == -1)
8876 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008877 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008878 kind = PyUnicode_KIND(input);
8879 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008880
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008881 if (size == 0)
8882 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008884 /* allocate enough for a simple 1:1 translation without
8885 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008886 _PyUnicodeWriter_Init(&writer);
8887 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889
Victor Stinner872b2912014-04-05 14:27:07 +02008890 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8891
Victor Stinner33798672016-03-01 21:59:58 +01008892 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008893 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008894 if (PyUnicode_IS_ASCII(input)) {
8895 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8896 if (res < 0) {
8897 _PyUnicodeWriter_Dealloc(&writer);
8898 return NULL;
8899 }
8900 if (res == 1)
8901 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008902 }
Victor Stinner33798672016-03-01 21:59:58 +01008903 else {
8904 i = 0;
8905 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008907 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008908 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008909 int translate;
8910 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8911 Py_ssize_t newpos;
8912 /* startpos for collecting untranslatable chars */
8913 Py_ssize_t collstart;
8914 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008915 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916
Victor Stinner1194ea02014-04-04 19:37:40 +02008917 ch = PyUnicode_READ(kind, data, i);
8918 translate = charmaptranslate_output(ch, mapping, &writer);
8919 if (translate < 0)
8920 goto onError;
8921
8922 if (translate != 0) {
8923 /* it worked => adjust input pointer */
8924 ++i;
8925 continue;
8926 }
8927
8928 /* untranslatable character */
8929 collstart = i;
8930 collend = i+1;
8931
8932 /* find all untranslatable characters */
8933 while (collend < size) {
8934 PyObject *x;
8935 ch = PyUnicode_READ(kind, data, collend);
8936 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008937 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008938 Py_XDECREF(x);
8939 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008940 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008941 ++collend;
8942 }
8943
8944 if (ignore) {
8945 i = collend;
8946 }
8947 else {
8948 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8949 reason, input, &exc,
8950 collstart, collend, &newpos);
8951 if (repunicode == NULL)
8952 goto onError;
8953 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008955 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008956 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008957 Py_DECREF(repunicode);
8958 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008959 }
8960 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008961 Py_XDECREF(exc);
8962 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008963 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964
Benjamin Peterson29060642009-01-31 22:14:21 +00008965 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008966 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008967 Py_XDECREF(exc);
8968 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969 return NULL;
8970}
8971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972/* Deprecated. Use PyUnicode_Translate instead. */
8973PyObject *
8974PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8975 Py_ssize_t size,
8976 PyObject *mapping,
8977 const char *errors)
8978{
Christian Heimes5f520f42012-09-11 14:03:25 +02008979 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008980 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981 if (!unicode)
8982 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008983 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8984 Py_DECREF(unicode);
8985 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008986}
8987
Alexander Belopolsky40018472011-02-26 01:02:56 +00008988PyObject *
8989PyUnicode_Translate(PyObject *str,
8990 PyObject *mapping,
8991 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008993 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02008994 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008995 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996}
Tim Petersced69f82003-09-16 20:30:58 +00008997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008998PyObject *
8999_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9000{
9001 if (!PyUnicode_Check(unicode)) {
9002 PyErr_BadInternalCall();
9003 return NULL;
9004 }
9005 if (PyUnicode_READY(unicode) == -1)
9006 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009007 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008 /* If the string is already ASCII, just return the same string */
9009 Py_INCREF(unicode);
9010 return unicode;
9011 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009012
9013 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9014 PyObject *result = PyUnicode_New(len, 127);
9015 if (result == NULL) {
9016 return NULL;
9017 }
9018
9019 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9020 int kind = PyUnicode_KIND(unicode);
9021 const void *data = PyUnicode_DATA(unicode);
9022 Py_ssize_t i;
9023 for (i = 0; i < len; ++i) {
9024 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9025 if (ch < 127) {
9026 out[i] = ch;
9027 }
9028 else if (Py_UNICODE_ISSPACE(ch)) {
9029 out[i] = ' ';
9030 }
9031 else {
9032 int decimal = Py_UNICODE_TODECIMAL(ch);
9033 if (decimal < 0) {
9034 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009035 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009036 _PyUnicode_LENGTH(result) = i + 1;
9037 break;
9038 }
9039 out[i] = '0' + decimal;
9040 }
9041 }
9042
INADA Naoki16dfca42018-07-14 12:06:43 +09009043 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009044 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045}
9046
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009047PyObject *
9048PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9049 Py_ssize_t length)
9050{
Victor Stinnerf0124502011-11-21 23:12:56 +01009051 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009052 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009053 Py_UCS4 maxchar;
9054 enum PyUnicode_Kind kind;
9055 void *data;
9056
Victor Stinner99d7ad02012-02-22 13:37:39 +01009057 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009058 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009059 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009060 if (ch > 127) {
9061 int decimal = Py_UNICODE_TODECIMAL(ch);
9062 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009063 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009064 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009065 }
9066 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009067
9068 /* Copy to a new string */
9069 decimal = PyUnicode_New(length, maxchar);
9070 if (decimal == NULL)
9071 return decimal;
9072 kind = PyUnicode_KIND(decimal);
9073 data = PyUnicode_DATA(decimal);
9074 /* Iterate over code points */
9075 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009076 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009077 if (ch > 127) {
9078 int decimal = Py_UNICODE_TODECIMAL(ch);
9079 if (decimal >= 0)
9080 ch = '0' + decimal;
9081 }
9082 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009083 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009084 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009085}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009086/* --- Decimal Encoder ---------------------------------------------------- */
9087
Alexander Belopolsky40018472011-02-26 01:02:56 +00009088int
9089PyUnicode_EncodeDecimal(Py_UNICODE *s,
9090 Py_ssize_t length,
9091 char *output,
9092 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009093{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009094 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009095 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009096 enum PyUnicode_Kind kind;
9097 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009098
9099 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009100 PyErr_BadArgument();
9101 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009102 }
9103
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009104 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009105 if (unicode == NULL)
9106 return -1;
9107
Victor Stinner42bf7752011-11-21 22:52:58 +01009108 kind = PyUnicode_KIND(unicode);
9109 data = PyUnicode_DATA(unicode);
9110
Victor Stinnerb84d7232011-11-22 01:50:07 +01009111 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009112 PyObject *exc;
9113 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009114 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009115 Py_ssize_t startpos;
9116
9117 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009118
Benjamin Peterson29060642009-01-31 22:14:21 +00009119 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009120 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009121 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009123 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009124 decimal = Py_UNICODE_TODECIMAL(ch);
9125 if (decimal >= 0) {
9126 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009127 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009128 continue;
9129 }
9130 if (0 < ch && ch < 256) {
9131 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009132 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009133 continue;
9134 }
Victor Stinner6345be92011-11-25 20:09:01 +01009135
Victor Stinner42bf7752011-11-21 22:52:58 +01009136 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009137 exc = NULL;
9138 raise_encode_exception(&exc, "decimal", unicode,
9139 startpos, startpos+1,
9140 "invalid decimal Unicode string");
9141 Py_XDECREF(exc);
9142 Py_DECREF(unicode);
9143 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009144 }
9145 /* 0-terminate the output string */
9146 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009147 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009148 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009149}
9150
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151/* --- Helpers ------------------------------------------------------------ */
9152
/* Helper macro to fix up start/end slice values against a sequence of
   length `len`, following Python slice conventions:
     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` is taken relative to the end of the
       sequence and clamped to 0 if still negative.
   `start` and `end` must be modifiable lvalues; `len` may be evaluated
   more than once.  The body is wrapped in do { } while (0) so that the
   macro behaves as a single statement (e.g. inside an unbraced if/else). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009169any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009170 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009171 Py_ssize_t end,
9172 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009173{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009174 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175 void *buf1, *buf2;
9176 Py_ssize_t len1, len2, result;
9177
9178 kind1 = PyUnicode_KIND(s1);
9179 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009180 if (kind1 < kind2)
9181 return -1;
9182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009183 len1 = PyUnicode_GET_LENGTH(s1);
9184 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009185 ADJUST_INDICES(start, end, len1);
9186 if (end - start < len2)
9187 return -1;
9188
9189 buf1 = PyUnicode_DATA(s1);
9190 buf2 = PyUnicode_DATA(s2);
9191 if (len2 == 1) {
9192 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9193 result = findchar((const char *)buf1 + kind1*start,
9194 kind1, end - start, ch, direction);
9195 if (result == -1)
9196 return -1;
9197 else
9198 return start + result;
9199 }
9200
9201 if (kind2 != kind1) {
9202 buf2 = _PyUnicode_AsKind(s2, kind1);
9203 if (!buf2)
9204 return -2;
9205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009206
Victor Stinner794d5672011-10-10 03:21:36 +02009207 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009208 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009209 case PyUnicode_1BYTE_KIND:
9210 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9211 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9212 else
9213 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9214 break;
9215 case PyUnicode_2BYTE_KIND:
9216 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9217 break;
9218 case PyUnicode_4BYTE_KIND:
9219 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9220 break;
9221 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009222 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009223 }
9224 }
9225 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009226 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009227 case PyUnicode_1BYTE_KIND:
9228 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9229 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9230 else
9231 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9232 break;
9233 case PyUnicode_2BYTE_KIND:
9234 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9235 break;
9236 case PyUnicode_4BYTE_KIND:
9237 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9238 break;
9239 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009240 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009241 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 }
9243
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009244 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245 PyMem_Free(buf2);
9246
9247 return result;
9248}
9249
Victor Stinner59423e32018-11-26 13:40:01 +01009250/* _PyUnicode_InsertThousandsGrouping() helper functions */
9251#include "stringlib/localeutil.h"
9252
9253/**
9254 * InsertThousandsGrouping:
9255 * @writer: Unicode writer.
9256 * @n_buffer: Number of characters in @buffer.
9257 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9258 * @d_pos: Start of digits string.
9259 * @n_digits: The number of digits in the string, in which we want
9260 * to put the grouping chars.
9261 * @min_width: The minimum width of the digits in the output string.
9262 * Output will be zero-padded on the left to fill.
9263 * @grouping: see definition in localeconv().
9264 * @thousands_sep: see definition in localeconv().
9265 *
9266 * There are 2 modes: counting and filling. If @writer is NULL,
9267 * we are in counting mode, else filling mode.
9268 * If counting, the required buffer size is returned.
9269 * If filling, we know the buffer will be large enough, so we don't
9270 * need to pass in the buffer size.
9271 * Inserts thousand grouping characters (as defined by grouping and
9272 * thousands_sep) into @writer.
9273 *
9274 * Return value: -1 on error, number of characters otherwise.
9275 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009276Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009277_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009278 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009279 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009280 PyObject *digits,
9281 Py_ssize_t d_pos,
9282 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009283 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009284 const char *grouping,
9285 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009286 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287{
Xtreak3f7983a2019-01-07 20:39:14 +05309288 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009289 if (writer) {
9290 assert(digits != NULL);
9291 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009292 }
9293 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009294 assert(digits == NULL);
9295 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009296 }
Victor Stinner59423e32018-11-26 13:40:01 +01009297 assert(0 <= d_pos);
9298 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009299 assert(grouping != NULL);
9300
9301 if (digits != NULL) {
9302 if (PyUnicode_READY(digits) == -1) {
9303 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009304 }
Victor Stinner59423e32018-11-26 13:40:01 +01009305 }
9306 if (PyUnicode_READY(thousands_sep) == -1) {
9307 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009308 }
9309
Victor Stinner59423e32018-11-26 13:40:01 +01009310 Py_ssize_t count = 0;
9311 Py_ssize_t n_zeros;
9312 int loop_broken = 0;
9313 int use_separator = 0; /* First time through, don't append the
9314 separator. They only go between
9315 groups. */
9316 Py_ssize_t buffer_pos;
9317 Py_ssize_t digits_pos;
9318 Py_ssize_t len;
9319 Py_ssize_t n_chars;
9320 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9321 be looked at */
9322 /* A generator that returns all of the grouping widths, until it
9323 returns 0. */
9324 GroupGenerator groupgen;
9325 GroupGenerator_init(&groupgen, grouping);
9326 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9327
9328 /* if digits are not grouped, thousands separator
9329 should be an empty string */
9330 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9331
9332 digits_pos = d_pos + n_digits;
9333 if (writer) {
9334 buffer_pos = writer->pos + n_buffer;
9335 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9336 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337 }
Victor Stinner59423e32018-11-26 13:40:01 +01009338 else {
9339 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009340 }
Victor Stinner59423e32018-11-26 13:40:01 +01009341
9342 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009343 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009344 }
Victor Stinner59423e32018-11-26 13:40:01 +01009345
9346 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9347 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9348 n_zeros = Py_MAX(0, len - remaining);
9349 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9350
9351 /* Use n_zero zero's and n_chars chars */
9352
9353 /* Count only, don't do anything. */
9354 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9355
9356 /* Copy into the writer. */
9357 InsertThousandsGrouping_fill(writer, &buffer_pos,
9358 digits, &digits_pos,
9359 n_chars, n_zeros,
9360 use_separator ? thousands_sep : NULL,
9361 thousands_sep_len, maxchar);
9362
9363 /* Use a separator next time. */
9364 use_separator = 1;
9365
9366 remaining -= n_chars;
9367 min_width -= len;
9368
9369 if (remaining <= 0 && min_width <= 0) {
9370 loop_broken = 1;
9371 break;
9372 }
9373 min_width -= thousands_sep_len;
9374 }
9375 if (!loop_broken) {
9376 /* We left the loop without using a break statement. */
9377
9378 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9379 n_zeros = Py_MAX(0, len - remaining);
9380 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9381
9382 /* Use n_zero zero's and n_chars chars */
9383 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9384
9385 /* Copy into the writer. */
9386 InsertThousandsGrouping_fill(writer, &buffer_pos,
9387 digits, &digits_pos,
9388 n_chars, n_zeros,
9389 use_separator ? thousands_sep : NULL,
9390 thousands_sep_len, maxchar);
9391 }
9392 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393}
9394
9395
Alexander Belopolsky40018472011-02-26 01:02:56 +00009396Py_ssize_t
9397PyUnicode_Count(PyObject *str,
9398 PyObject *substr,
9399 Py_ssize_t start,
9400 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009402 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009403 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404 void *buf1 = NULL, *buf2 = NULL;
9405 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009406
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009407 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009408 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009409
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009410 kind1 = PyUnicode_KIND(str);
9411 kind2 = PyUnicode_KIND(substr);
9412 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009413 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009414
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009415 len1 = PyUnicode_GET_LENGTH(str);
9416 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009418 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009419 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009420
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009421 buf1 = PyUnicode_DATA(str);
9422 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009423 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009424 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009425 if (!buf2)
9426 goto onError;
9427 }
9428
9429 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009431 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009432 result = asciilib_count(
9433 ((Py_UCS1*)buf1) + start, end - start,
9434 buf2, len2, PY_SSIZE_T_MAX
9435 );
9436 else
9437 result = ucs1lib_count(
9438 ((Py_UCS1*)buf1) + start, end - start,
9439 buf2, len2, PY_SSIZE_T_MAX
9440 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 break;
9442 case PyUnicode_2BYTE_KIND:
9443 result = ucs2lib_count(
9444 ((Py_UCS2*)buf1) + start, end - start,
9445 buf2, len2, PY_SSIZE_T_MAX
9446 );
9447 break;
9448 case PyUnicode_4BYTE_KIND:
9449 result = ucs4lib_count(
9450 ((Py_UCS4*)buf1) + start, end - start,
9451 buf2, len2, PY_SSIZE_T_MAX
9452 );
9453 break;
9454 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009455 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009457
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009458 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 PyMem_Free(buf2);
9460
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009463 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 PyMem_Free(buf2);
9465 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466}
9467
Alexander Belopolsky40018472011-02-26 01:02:56 +00009468Py_ssize_t
9469PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009470 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009471 Py_ssize_t start,
9472 Py_ssize_t end,
9473 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009475 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009476 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009477
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009478 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479}
9480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481Py_ssize_t
9482PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9483 Py_ssize_t start, Py_ssize_t end,
9484 int direction)
9485{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009487 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 if (PyUnicode_READY(str) == -1)
9489 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009490 len = PyUnicode_GET_LENGTH(str);
9491 ADJUST_INDICES(start, end, len);
9492 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009493 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009495 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9496 kind, end-start, ch, direction);
9497 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009499 else
9500 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501}
9502
Alexander Belopolsky40018472011-02-26 01:02:56 +00009503static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009504tailmatch(PyObject *self,
9505 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009506 Py_ssize_t start,
9507 Py_ssize_t end,
9508 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 int kind_self;
9511 int kind_sub;
9512 void *data_self;
9513 void *data_sub;
9514 Py_ssize_t offset;
9515 Py_ssize_t i;
9516 Py_ssize_t end_sub;
9517
9518 if (PyUnicode_READY(self) == -1 ||
9519 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009520 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9523 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009524 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009525 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009526
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009527 if (PyUnicode_GET_LENGTH(substring) == 0)
9528 return 1;
9529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 kind_self = PyUnicode_KIND(self);
9531 data_self = PyUnicode_DATA(self);
9532 kind_sub = PyUnicode_KIND(substring);
9533 data_sub = PyUnicode_DATA(substring);
9534 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9535
9536 if (direction > 0)
9537 offset = end;
9538 else
9539 offset = start;
9540
9541 if (PyUnicode_READ(kind_self, data_self, offset) ==
9542 PyUnicode_READ(kind_sub, data_sub, 0) &&
9543 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9544 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9545 /* If both are of the same kind, memcmp is sufficient */
9546 if (kind_self == kind_sub) {
9547 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009548 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 data_sub,
9550 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009551 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009553 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009554 else {
9555 /* We do not need to compare 0 and len(substring)-1 because
9556 the if statement above ensured already that they are equal
9557 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 for (i = 1; i < end_sub; ++i) {
9559 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9560 PyUnicode_READ(kind_sub, data_sub, i))
9561 return 0;
9562 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009563 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565 }
9566
9567 return 0;
9568}
9569
Alexander Belopolsky40018472011-02-26 01:02:56 +00009570Py_ssize_t
9571PyUnicode_Tailmatch(PyObject *str,
9572 PyObject *substr,
9573 Py_ssize_t start,
9574 Py_ssize_t end,
9575 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009576{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009577 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009578 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009579
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009580 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581}
9582
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009583static PyObject *
9584ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009586 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9587 char *resdata, *data = PyUnicode_DATA(self);
9588 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009589
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009590 res = PyUnicode_New(len, 127);
9591 if (res == NULL)
9592 return NULL;
9593 resdata = PyUnicode_DATA(res);
9594 if (lower)
9595 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009597 _Py_bytes_upper(resdata, data, len);
9598 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599}
9600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009602handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009604 Py_ssize_t j;
9605 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009606 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009607 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009608
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009609 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9610
9611 where ! is a negation and \p{xxx} is a character with property xxx.
9612 */
9613 for (j = i - 1; j >= 0; j--) {
9614 c = PyUnicode_READ(kind, data, j);
9615 if (!_PyUnicode_IsCaseIgnorable(c))
9616 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009617 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009618 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9619 if (final_sigma) {
9620 for (j = i + 1; j < length; j++) {
9621 c = PyUnicode_READ(kind, data, j);
9622 if (!_PyUnicode_IsCaseIgnorable(c))
9623 break;
9624 }
9625 final_sigma = j == length || !_PyUnicode_IsCased(c);
9626 }
9627 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628}
9629
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009630static int
9631lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9632 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009634 /* Obscure special case. */
9635 if (c == 0x3A3) {
9636 mapped[0] = handle_capital_sigma(kind, data, length, i);
9637 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009639 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640}
9641
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009642static Py_ssize_t
9643do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009645 Py_ssize_t i, k = 0;
9646 int n_res, j;
9647 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009648
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009649 c = PyUnicode_READ(kind, data, 0);
9650 n_res = _PyUnicode_ToUpperFull(c, mapped);
9651 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009652 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009653 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009655 for (i = 1; i < length; i++) {
9656 c = PyUnicode_READ(kind, data, i);
9657 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9658 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009659 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009660 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009661 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009662 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664}
9665
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009666static Py_ssize_t
9667do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9668 Py_ssize_t i, k = 0;
9669
9670 for (i = 0; i < length; i++) {
9671 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9672 int n_res, j;
9673 if (Py_UNICODE_ISUPPER(c)) {
9674 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9675 }
9676 else if (Py_UNICODE_ISLOWER(c)) {
9677 n_res = _PyUnicode_ToUpperFull(c, mapped);
9678 }
9679 else {
9680 n_res = 1;
9681 mapped[0] = c;
9682 }
9683 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009684 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009685 res[k++] = mapped[j];
9686 }
9687 }
9688 return k;
9689}
9690
9691static Py_ssize_t
9692do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9693 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009695 Py_ssize_t i, k = 0;
9696
9697 for (i = 0; i < length; i++) {
9698 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9699 int n_res, j;
9700 if (lower)
9701 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9702 else
9703 n_res = _PyUnicode_ToUpperFull(c, mapped);
9704 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009705 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009706 res[k++] = mapped[j];
9707 }
9708 }
9709 return k;
9710}
9711
9712static Py_ssize_t
9713do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9714{
9715 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9716}
9717
9718static Py_ssize_t
9719do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9720{
9721 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9722}
9723
Benjamin Petersone51757f2012-01-12 21:10:29 -05009724static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009725do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9726{
9727 Py_ssize_t i, k = 0;
9728
9729 for (i = 0; i < length; i++) {
9730 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9731 Py_UCS4 mapped[3];
9732 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9733 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009734 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009735 res[k++] = mapped[j];
9736 }
9737 }
9738 return k;
9739}
9740
9741static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009742do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9743{
9744 Py_ssize_t i, k = 0;
9745 int previous_is_cased;
9746
9747 previous_is_cased = 0;
9748 for (i = 0; i < length; i++) {
9749 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9750 Py_UCS4 mapped[3];
9751 int n_res, j;
9752
9753 if (previous_is_cased)
9754 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9755 else
9756 n_res = _PyUnicode_ToTitleFull(c, mapped);
9757
9758 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009759 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009760 res[k++] = mapped[j];
9761 }
9762
9763 previous_is_cased = _PyUnicode_IsCased(c);
9764 }
9765 return k;
9766}
9767
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009768static PyObject *
9769case_operation(PyObject *self,
9770 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9771{
9772 PyObject *res = NULL;
9773 Py_ssize_t length, newlength = 0;
9774 int kind, outkind;
9775 void *data, *outdata;
9776 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9777
Benjamin Petersoneea48462012-01-16 14:28:50 -05009778 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009779
9780 kind = PyUnicode_KIND(self);
9781 data = PyUnicode_DATA(self);
9782 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009783 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009784 PyErr_SetString(PyExc_OverflowError, "string is too long");
9785 return NULL;
9786 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009787 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009788 if (tmp == NULL)
9789 return PyErr_NoMemory();
9790 newlength = perform(kind, data, length, tmp, &maxchar);
9791 res = PyUnicode_New(newlength, maxchar);
9792 if (res == NULL)
9793 goto leave;
9794 tmpend = tmp + newlength;
9795 outdata = PyUnicode_DATA(res);
9796 outkind = PyUnicode_KIND(res);
9797 switch (outkind) {
9798 case PyUnicode_1BYTE_KIND:
9799 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9800 break;
9801 case PyUnicode_2BYTE_KIND:
9802 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9803 break;
9804 case PyUnicode_4BYTE_KIND:
9805 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9806 break;
9807 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009808 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009809 }
9810 leave:
9811 PyMem_FREE(tmp);
9812 return res;
9813}
9814
Tim Peters8ce9f162004-08-27 01:49:32 +00009815PyObject *
9816PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009817{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009818 PyObject *res;
9819 PyObject *fseq;
9820 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009821 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009822
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009823 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009824 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009825 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009826 }
9827
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009828 /* NOTE: the following code can't call back into Python code,
9829 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009830 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009831
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009832 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009833 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009834 res = _PyUnicode_JoinArray(separator, items, seqlen);
9835 Py_DECREF(fseq);
9836 return res;
9837}
9838
9839PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009840_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009841{
9842 PyObject *res = NULL; /* the result */
9843 PyObject *sep = NULL;
9844 Py_ssize_t seplen;
9845 PyObject *item;
9846 Py_ssize_t sz, i, res_offset;
9847 Py_UCS4 maxchar;
9848 Py_UCS4 item_maxchar;
9849 int use_memcpy;
9850 unsigned char *res_data = NULL, *sep_data = NULL;
9851 PyObject *last_obj;
9852 unsigned int kind = 0;
9853
Tim Peters05eba1f2004-08-27 21:32:02 +00009854 /* If empty sequence, return u"". */
9855 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009856 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009857 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009858
Tim Peters05eba1f2004-08-27 21:32:02 +00009859 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009860 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009861 if (seqlen == 1) {
9862 if (PyUnicode_CheckExact(items[0])) {
9863 res = items[0];
9864 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009865 return res;
9866 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009867 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009868 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009869 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009870 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009871 /* Set up sep and seplen */
9872 if (separator == NULL) {
9873 /* fall back to a blank space separator */
9874 sep = PyUnicode_FromOrdinal(' ');
9875 if (!sep)
9876 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009877 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009878 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009879 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009880 else {
9881 if (!PyUnicode_Check(separator)) {
9882 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009883 "separator: expected str instance,"
9884 " %.80s found",
9885 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009886 goto onError;
9887 }
9888 if (PyUnicode_READY(separator))
9889 goto onError;
9890 sep = separator;
9891 seplen = PyUnicode_GET_LENGTH(separator);
9892 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9893 /* inc refcount to keep this code path symmetric with the
9894 above case of a blank separator */
9895 Py_INCREF(sep);
9896 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009897 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009898 }
9899
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009900 /* There are at least two things to join, or else we have a subclass
9901 * of str in the sequence.
9902 * Do a pre-pass to figure out the total amount of space we'll
9903 * need (sz), and see whether all argument are strings.
9904 */
9905 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009906#ifdef Py_DEBUG
9907 use_memcpy = 0;
9908#else
9909 use_memcpy = 1;
9910#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009911 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009912 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009913 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009914 if (!PyUnicode_Check(item)) {
9915 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009916 "sequence item %zd: expected str instance,"
9917 " %.80s found",
9918 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009919 goto onError;
9920 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 if (PyUnicode_READY(item) == -1)
9922 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009923 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009924 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009925 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009926 if (i != 0) {
9927 add_sz += seplen;
9928 }
9929 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009930 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009931 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009932 goto onError;
9933 }
Xiang Zhangb0541f42017-01-10 10:52:00 +08009934 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +02009935 if (use_memcpy && last_obj != NULL) {
9936 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9937 use_memcpy = 0;
9938 }
9939 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009940 }
Tim Petersced69f82003-09-16 20:30:58 +00009941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009943 if (res == NULL)
9944 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009945
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009946 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009947#ifdef Py_DEBUG
9948 use_memcpy = 0;
9949#else
9950 if (use_memcpy) {
9951 res_data = PyUnicode_1BYTE_DATA(res);
9952 kind = PyUnicode_KIND(res);
9953 if (seplen != 0)
9954 sep_data = PyUnicode_1BYTE_DATA(sep);
9955 }
9956#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009957 if (use_memcpy) {
9958 for (i = 0; i < seqlen; ++i) {
9959 Py_ssize_t itemlen;
9960 item = items[i];
9961
9962 /* Copy item, and maybe the separator. */
9963 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009964 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009965 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009966 kind * seplen);
9967 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009968 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009969
9970 itemlen = PyUnicode_GET_LENGTH(item);
9971 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009972 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009973 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009974 kind * itemlen);
9975 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009976 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009977 }
9978 assert(res_data == PyUnicode_1BYTE_DATA(res)
9979 + kind * PyUnicode_GET_LENGTH(res));
9980 }
9981 else {
9982 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9983 Py_ssize_t itemlen;
9984 item = items[i];
9985
9986 /* Copy item, and maybe the separator. */
9987 if (i && seplen != 0) {
9988 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9989 res_offset += seplen;
9990 }
9991
9992 itemlen = PyUnicode_GET_LENGTH(item);
9993 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009994 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009995 res_offset += itemlen;
9996 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009997 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009998 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009999 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010002 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004
Benjamin Peterson29060642009-01-31 22:14:21 +000010005 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010007 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008 return NULL;
10009}
10010
Victor Stinnerd3f08822012-05-29 12:57:52 +020010011void
10012_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10013 Py_UCS4 fill_char)
10014{
10015 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010016 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010017 assert(PyUnicode_IS_READY(unicode));
10018 assert(unicode_modifiable(unicode));
10019 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10020 assert(start >= 0);
10021 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010022 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010023}
10024
Victor Stinner3fe55312012-01-04 00:33:50 +010010025Py_ssize_t
10026PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10027 Py_UCS4 fill_char)
10028{
10029 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010030
10031 if (!PyUnicode_Check(unicode)) {
10032 PyErr_BadInternalCall();
10033 return -1;
10034 }
10035 if (PyUnicode_READY(unicode) == -1)
10036 return -1;
10037 if (unicode_check_modifiable(unicode))
10038 return -1;
10039
Victor Stinnerd3f08822012-05-29 12:57:52 +020010040 if (start < 0) {
10041 PyErr_SetString(PyExc_IndexError, "string index out of range");
10042 return -1;
10043 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010044 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10045 PyErr_SetString(PyExc_ValueError,
10046 "fill character is bigger than "
10047 "the string maximum character");
10048 return -1;
10049 }
10050
10051 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10052 length = Py_MIN(maxlen, length);
10053 if (length <= 0)
10054 return 0;
10055
Victor Stinnerd3f08822012-05-29 12:57:52 +020010056 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010057 return length;
10058}
10059
Victor Stinner9310abb2011-10-05 00:59:23 +020010060static PyObject *
10061pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010062 Py_ssize_t left,
10063 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010065{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 PyObject *u;
10067 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010068 int kind;
10069 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010070
10071 if (left < 0)
10072 left = 0;
10073 if (right < 0)
10074 right = 0;
10075
Victor Stinnerc4b49542011-12-11 22:44:26 +010010076 if (left == 0 && right == 0)
10077 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010078
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10080 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010081 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10082 return NULL;
10083 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010085 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010087 if (!u)
10088 return NULL;
10089
10090 kind = PyUnicode_KIND(u);
10091 data = PyUnicode_DATA(u);
10092 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010093 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010094 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010095 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010096 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010097 assert(_PyUnicode_CheckConsistency(u, 1));
10098 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010099}
10100
Alexander Belopolsky40018472011-02-26 01:02:56 +000010101PyObject *
10102PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010103{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010104 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010105
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010106 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010107 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010108
Benjamin Petersonead6b532011-12-20 17:23:42 -060010109 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010111 if (PyUnicode_IS_ASCII(string))
10112 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010113 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010114 PyUnicode_GET_LENGTH(string), keepends);
10115 else
10116 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010117 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010118 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 break;
10120 case PyUnicode_2BYTE_KIND:
10121 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010122 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 PyUnicode_GET_LENGTH(string), keepends);
10124 break;
10125 case PyUnicode_4BYTE_KIND:
10126 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010127 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 PyUnicode_GET_LENGTH(string), keepends);
10129 break;
10130 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010131 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010133 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010134}
10135
Alexander Belopolsky40018472011-02-26 01:02:56 +000010136static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010137split(PyObject *self,
10138 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010139 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010141 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 void *buf1, *buf2;
10143 Py_ssize_t len1, len2;
10144 PyObject* out;
10145
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010147 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 if (PyUnicode_READY(self) == -1)
10150 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010153 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010155 if (PyUnicode_IS_ASCII(self))
10156 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010157 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010158 PyUnicode_GET_LENGTH(self), maxcount
10159 );
10160 else
10161 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010162 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010163 PyUnicode_GET_LENGTH(self), maxcount
10164 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 case PyUnicode_2BYTE_KIND:
10166 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010167 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 PyUnicode_GET_LENGTH(self), maxcount
10169 );
10170 case PyUnicode_4BYTE_KIND:
10171 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010172 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 PyUnicode_GET_LENGTH(self), maxcount
10174 );
10175 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010176 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 }
10178
10179 if (PyUnicode_READY(substring) == -1)
10180 return NULL;
10181
10182 kind1 = PyUnicode_KIND(self);
10183 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 len1 = PyUnicode_GET_LENGTH(self);
10185 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010186 if (kind1 < kind2 || len1 < len2) {
10187 out = PyList_New(1);
10188 if (out == NULL)
10189 return NULL;
10190 Py_INCREF(self);
10191 PyList_SET_ITEM(out, 0, self);
10192 return out;
10193 }
10194 buf1 = PyUnicode_DATA(self);
10195 buf2 = PyUnicode_DATA(substring);
10196 if (kind2 != kind1) {
10197 buf2 = _PyUnicode_AsKind(substring, kind1);
10198 if (!buf2)
10199 return NULL;
10200 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010202 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010204 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10205 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010206 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010207 else
10208 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010209 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 break;
10211 case PyUnicode_2BYTE_KIND:
10212 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010213 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 break;
10215 case PyUnicode_4BYTE_KIND:
10216 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010217 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 break;
10219 default:
10220 out = NULL;
10221 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010222 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 PyMem_Free(buf2);
10224 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225}
10226
Alexander Belopolsky40018472011-02-26 01:02:56 +000010227static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010228rsplit(PyObject *self,
10229 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010230 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010231{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010232 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 void *buf1, *buf2;
10234 Py_ssize_t len1, len2;
10235 PyObject* out;
10236
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010237 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010238 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 if (PyUnicode_READY(self) == -1)
10241 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010244 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010246 if (PyUnicode_IS_ASCII(self))
10247 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010248 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010249 PyUnicode_GET_LENGTH(self), maxcount
10250 );
10251 else
10252 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010253 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010254 PyUnicode_GET_LENGTH(self), maxcount
10255 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 case PyUnicode_2BYTE_KIND:
10257 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010258 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 PyUnicode_GET_LENGTH(self), maxcount
10260 );
10261 case PyUnicode_4BYTE_KIND:
10262 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010263 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 PyUnicode_GET_LENGTH(self), maxcount
10265 );
10266 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010267 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 }
10269
10270 if (PyUnicode_READY(substring) == -1)
10271 return NULL;
10272
10273 kind1 = PyUnicode_KIND(self);
10274 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 len1 = PyUnicode_GET_LENGTH(self);
10276 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010277 if (kind1 < kind2 || len1 < len2) {
10278 out = PyList_New(1);
10279 if (out == NULL)
10280 return NULL;
10281 Py_INCREF(self);
10282 PyList_SET_ITEM(out, 0, self);
10283 return out;
10284 }
10285 buf1 = PyUnicode_DATA(self);
10286 buf2 = PyUnicode_DATA(substring);
10287 if (kind2 != kind1) {
10288 buf2 = _PyUnicode_AsKind(substring, kind1);
10289 if (!buf2)
10290 return NULL;
10291 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010293 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010295 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10296 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010297 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010298 else
10299 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010300 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 break;
10302 case PyUnicode_2BYTE_KIND:
10303 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010304 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 break;
10306 case PyUnicode_4BYTE_KIND:
10307 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010308 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 break;
10310 default:
10311 out = NULL;
10312 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010313 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 PyMem_Free(buf2);
10315 return out;
10316}
10317
10318static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010319anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10320 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010322 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010324 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10325 return asciilib_find(buf1, len1, buf2, len2, offset);
10326 else
10327 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 case PyUnicode_2BYTE_KIND:
10329 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10330 case PyUnicode_4BYTE_KIND:
10331 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10332 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010333 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334}
10335
10336static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010337anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10338 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010340 switch (kind) {
10341 case PyUnicode_1BYTE_KIND:
10342 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10343 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10344 else
10345 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10346 case PyUnicode_2BYTE_KIND:
10347 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10348 case PyUnicode_4BYTE_KIND:
10349 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10350 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010351 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010352}
10353
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010354static void
10355replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10356 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10357{
10358 int kind = PyUnicode_KIND(u);
10359 void *data = PyUnicode_DATA(u);
10360 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10361 if (kind == PyUnicode_1BYTE_KIND) {
10362 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10363 (Py_UCS1 *)data + len,
10364 u1, u2, maxcount);
10365 }
10366 else if (kind == PyUnicode_2BYTE_KIND) {
10367 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10368 (Py_UCS2 *)data + len,
10369 u1, u2, maxcount);
10370 }
10371 else {
10372 assert(kind == PyUnicode_4BYTE_KIND);
10373 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10374 (Py_UCS4 *)data + len,
10375 u1, u2, maxcount);
10376 }
10377}
10378
Alexander Belopolsky40018472011-02-26 01:02:56 +000010379static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380replace(PyObject *self, PyObject *str1,
10381 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 PyObject *u;
10384 char *sbuf = PyUnicode_DATA(self);
10385 char *buf1 = PyUnicode_DATA(str1);
10386 char *buf2 = PyUnicode_DATA(str2);
10387 int srelease = 0, release1 = 0, release2 = 0;
10388 int skind = PyUnicode_KIND(self);
10389 int kind1 = PyUnicode_KIND(str1);
10390 int kind2 = PyUnicode_KIND(str2);
10391 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10392 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10393 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010394 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010395 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396
10397 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010398 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010400 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401
Victor Stinner59de0ee2011-10-07 10:01:28 +020010402 if (str1 == str2)
10403 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404
Victor Stinner49a0a212011-10-12 23:46:10 +020010405 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010406 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10407 if (maxchar < maxchar_str1)
10408 /* substring too wide to be present */
10409 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010410 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10411 /* Replacing str1 with str2 may cause a maxchar reduction in the
10412 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010413 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010414 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010417 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010419 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010421 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010422 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010423 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010424
Victor Stinner69ed0f42013-04-09 21:48:24 +020010425 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010426 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010427 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010428 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010429 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010431 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010433
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010434 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10435 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010436 }
10437 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 int rkind = skind;
10439 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010440 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 if (kind1 < rkind) {
10443 /* widen substring */
10444 buf1 = _PyUnicode_AsKind(str1, rkind);
10445 if (!buf1) goto error;
10446 release1 = 1;
10447 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010448 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010449 if (i < 0)
10450 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 if (rkind > kind2) {
10452 /* widen replacement */
10453 buf2 = _PyUnicode_AsKind(str2, rkind);
10454 if (!buf2) goto error;
10455 release2 = 1;
10456 }
10457 else if (rkind < kind2) {
10458 /* widen self and buf1 */
10459 rkind = kind2;
10460 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010461 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 sbuf = _PyUnicode_AsKind(self, rkind);
10463 if (!sbuf) goto error;
10464 srelease = 1;
10465 buf1 = _PyUnicode_AsKind(str1, rkind);
10466 if (!buf1) goto error;
10467 release1 = 1;
10468 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010469 u = PyUnicode_New(slen, maxchar);
10470 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010472 assert(PyUnicode_KIND(u) == rkind);
10473 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010474
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010475 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010476 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010477 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010479 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010481
10482 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010483 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010484 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010485 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010486 if (i == -1)
10487 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010488 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010490 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010492 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010494 }
10495 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010497 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 int rkind = skind;
10499 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010502 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 buf1 = _PyUnicode_AsKind(str1, rkind);
10504 if (!buf1) goto error;
10505 release1 = 1;
10506 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010507 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010508 if (n == 0)
10509 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010511 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 buf2 = _PyUnicode_AsKind(str2, rkind);
10513 if (!buf2) goto error;
10514 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010517 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 rkind = kind2;
10519 sbuf = _PyUnicode_AsKind(self, rkind);
10520 if (!sbuf) goto error;
10521 srelease = 1;
10522 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010523 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 buf1 = _PyUnicode_AsKind(str1, rkind);
10525 if (!buf1) goto error;
10526 release1 = 1;
10527 }
10528 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10529 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010530 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 PyErr_SetString(PyExc_OverflowError,
10532 "replace string is too long");
10533 goto error;
10534 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010535 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010536 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010537 _Py_INCREF_UNICODE_EMPTY();
10538 if (!unicode_empty)
10539 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010540 u = unicode_empty;
10541 goto done;
10542 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010543 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 PyErr_SetString(PyExc_OverflowError,
10545 "replace string is too long");
10546 goto error;
10547 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010548 u = PyUnicode_New(new_size, maxchar);
10549 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010551 assert(PyUnicode_KIND(u) == rkind);
10552 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 ires = i = 0;
10554 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010555 while (n-- > 0) {
10556 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010557 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010558 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010559 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010560 if (j == -1)
10561 break;
10562 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010563 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010564 memcpy(res + rkind * ires,
10565 sbuf + rkind * i,
10566 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010568 }
10569 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010571 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010573 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010575 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010577 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010579 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010580 memcpy(res + rkind * ires,
10581 sbuf + rkind * i,
10582 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010583 }
10584 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010585 /* interleave */
10586 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010587 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010589 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010591 if (--n <= 0)
10592 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010593 memcpy(res + rkind * ires,
10594 sbuf + rkind * i,
10595 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 ires++;
10597 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010598 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010599 memcpy(res + rkind * ires,
10600 sbuf + rkind * i,
10601 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010602 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010603 }
10604
10605 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010606 unicode_adjust_maxchar(&u);
10607 if (u == NULL)
10608 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010610
10611 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 if (srelease)
10613 PyMem_FREE(sbuf);
10614 if (release1)
10615 PyMem_FREE(buf1);
10616 if (release2)
10617 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010618 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010620
Benjamin Peterson29060642009-01-31 22:14:21 +000010621 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010622 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 if (srelease)
10624 PyMem_FREE(sbuf);
10625 if (release1)
10626 PyMem_FREE(buf1);
10627 if (release2)
10628 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010629 return unicode_result_unchanged(self);
10630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 error:
10632 if (srelease && sbuf)
10633 PyMem_FREE(sbuf);
10634 if (release1 && buf1)
10635 PyMem_FREE(buf1);
10636 if (release2 && buf2)
10637 PyMem_FREE(buf2);
10638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639}
10640
10641/* --- Unicode Object Methods --------------------------------------------- */
10642
INADA Naoki3ae20562017-01-16 20:41:20 +090010643/*[clinic input]
10644str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645
INADA Naoki3ae20562017-01-16 20:41:20 +090010646Return a version of the string where each word is titlecased.
10647
10648More specifically, words start with uppercased characters and all remaining
10649cased characters have lower case.
10650[clinic start generated code]*/
10651
10652static PyObject *
10653unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010654/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010656 if (PyUnicode_READY(self) == -1)
10657 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010658 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010659}
10660
INADA Naoki3ae20562017-01-16 20:41:20 +090010661/*[clinic input]
10662str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010663
INADA Naoki3ae20562017-01-16 20:41:20 +090010664Return a capitalized version of the string.
10665
10666More specifically, make the first character have upper case and the rest lower
10667case.
10668[clinic start generated code]*/
10669
10670static PyObject *
10671unicode_capitalize_impl(PyObject *self)
10672/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010673{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010674 if (PyUnicode_READY(self) == -1)
10675 return NULL;
10676 if (PyUnicode_GET_LENGTH(self) == 0)
10677 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010678 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679}
10680
INADA Naoki3ae20562017-01-16 20:41:20 +090010681/*[clinic input]
10682str.casefold as unicode_casefold
10683
10684Return a version of the string suitable for caseless comparisons.
10685[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010686
10687static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010688unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010689/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010690{
10691 if (PyUnicode_READY(self) == -1)
10692 return NULL;
10693 if (PyUnicode_IS_ASCII(self))
10694 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010695 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010696}
10697
10698
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010699/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010700
10701static int
10702convert_uc(PyObject *obj, void *addr)
10703{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010705
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010706 if (!PyUnicode_Check(obj)) {
10707 PyErr_Format(PyExc_TypeError,
10708 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010709 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010710 return 0;
10711 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010712 if (PyUnicode_READY(obj) < 0)
10713 return 0;
10714 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010715 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010716 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010717 return 0;
10718 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010719 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010720 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010721}
10722
INADA Naoki3ae20562017-01-16 20:41:20 +090010723/*[clinic input]
10724str.center as unicode_center
10725
10726 width: Py_ssize_t
10727 fillchar: Py_UCS4 = ' '
10728 /
10729
10730Return a centered string of length width.
10731
10732Padding is done using the specified fill character (default is a space).
10733[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734
10735static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010736unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10737/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010739 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740
Benjamin Petersonbac79492012-01-14 13:34:47 -050010741 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742 return NULL;
10743
Victor Stinnerc4b49542011-12-11 22:44:26 +010010744 if (PyUnicode_GET_LENGTH(self) >= width)
10745 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746
Victor Stinnerc4b49542011-12-11 22:44:26 +010010747 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748 left = marg / 2 + (marg & width & 1);
10749
Victor Stinner9310abb2011-10-05 00:59:23 +020010750 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751}
10752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753/* This function assumes that str1 and str2 are readied by the caller. */
10754
Marc-André Lemburge5034372000-08-08 08:04:29 +000010755static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010756unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010757{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010758#define COMPARE(TYPE1, TYPE2) \
10759 do { \
10760 TYPE1* p1 = (TYPE1 *)data1; \
10761 TYPE2* p2 = (TYPE2 *)data2; \
10762 TYPE1* end = p1 + len; \
10763 Py_UCS4 c1, c2; \
10764 for (; p1 != end; p1++, p2++) { \
10765 c1 = *p1; \
10766 c2 = *p2; \
10767 if (c1 != c2) \
10768 return (c1 < c2) ? -1 : 1; \
10769 } \
10770 } \
10771 while (0)
10772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773 int kind1, kind2;
10774 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010775 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 kind1 = PyUnicode_KIND(str1);
10778 kind2 = PyUnicode_KIND(str2);
10779 data1 = PyUnicode_DATA(str1);
10780 data2 = PyUnicode_DATA(str2);
10781 len1 = PyUnicode_GET_LENGTH(str1);
10782 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010783 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010784
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010785 switch(kind1) {
10786 case PyUnicode_1BYTE_KIND:
10787 {
10788 switch(kind2) {
10789 case PyUnicode_1BYTE_KIND:
10790 {
10791 int cmp = memcmp(data1, data2, len);
10792 /* normalize result of memcmp() into the range [-1; 1] */
10793 if (cmp < 0)
10794 return -1;
10795 if (cmp > 0)
10796 return 1;
10797 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010798 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010799 case PyUnicode_2BYTE_KIND:
10800 COMPARE(Py_UCS1, Py_UCS2);
10801 break;
10802 case PyUnicode_4BYTE_KIND:
10803 COMPARE(Py_UCS1, Py_UCS4);
10804 break;
10805 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010806 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010807 }
10808 break;
10809 }
10810 case PyUnicode_2BYTE_KIND:
10811 {
10812 switch(kind2) {
10813 case PyUnicode_1BYTE_KIND:
10814 COMPARE(Py_UCS2, Py_UCS1);
10815 break;
10816 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010817 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010818 COMPARE(Py_UCS2, Py_UCS2);
10819 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010820 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010821 case PyUnicode_4BYTE_KIND:
10822 COMPARE(Py_UCS2, Py_UCS4);
10823 break;
10824 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010825 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010826 }
10827 break;
10828 }
10829 case PyUnicode_4BYTE_KIND:
10830 {
10831 switch(kind2) {
10832 case PyUnicode_1BYTE_KIND:
10833 COMPARE(Py_UCS4, Py_UCS1);
10834 break;
10835 case PyUnicode_2BYTE_KIND:
10836 COMPARE(Py_UCS4, Py_UCS2);
10837 break;
10838 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010839 {
10840#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10841 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10842 /* normalize result of wmemcmp() into the range [-1; 1] */
10843 if (cmp < 0)
10844 return -1;
10845 if (cmp > 0)
10846 return 1;
10847#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010848 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010849#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010850 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010851 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010852 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010853 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010854 }
10855 break;
10856 }
10857 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010858 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010859 }
10860
Victor Stinner770e19e2012-10-04 22:59:45 +020010861 if (len1 == len2)
10862 return 0;
10863 if (len1 < len2)
10864 return -1;
10865 else
10866 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010867
10868#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010869}
10870
Benjamin Peterson621b4302016-09-09 13:54:34 -070010871static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010872unicode_compare_eq(PyObject *str1, PyObject *str2)
10873{
10874 int kind;
10875 void *data1, *data2;
10876 Py_ssize_t len;
10877 int cmp;
10878
Victor Stinnere5567ad2012-10-23 02:48:49 +020010879 len = PyUnicode_GET_LENGTH(str1);
10880 if (PyUnicode_GET_LENGTH(str2) != len)
10881 return 0;
10882 kind = PyUnicode_KIND(str1);
10883 if (PyUnicode_KIND(str2) != kind)
10884 return 0;
10885 data1 = PyUnicode_DATA(str1);
10886 data2 = PyUnicode_DATA(str2);
10887
10888 cmp = memcmp(data1, data2, len * kind);
10889 return (cmp == 0);
10890}
10891
10892
Alexander Belopolsky40018472011-02-26 01:02:56 +000010893int
10894PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010895{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010896 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10897 if (PyUnicode_READY(left) == -1 ||
10898 PyUnicode_READY(right) == -1)
10899 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010900
10901 /* a string is equal to itself */
10902 if (left == right)
10903 return 0;
10904
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010905 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010907 PyErr_Format(PyExc_TypeError,
10908 "Can't compare %.100s and %.100s",
10909 left->ob_type->tp_name,
10910 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911 return -1;
10912}
10913
Martin v. Löwis5b222132007-06-10 09:51:05 +000010914int
10915PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10916{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010917 Py_ssize_t i;
10918 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010920 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921
Victor Stinner910337b2011-10-03 03:20:16 +020010922 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010923 if (!PyUnicode_IS_READY(uni)) {
10924 const wchar_t *ws = _PyUnicode_WSTR(uni);
10925 /* Compare Unicode string and source character set string */
10926 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
10927 if (chr != ustr[i])
10928 return (chr < ustr[i]) ? -1 : 1;
10929 }
10930 /* This check keeps Python strings that end in '\0' from comparing equal
10931 to C strings identical up to that point. */
10932 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
10933 return 1; /* uni is longer */
10934 if (ustr[i])
10935 return -1; /* str is longer */
10936 return 0;
10937 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010939 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010940 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010941 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010942 size_t len, len2 = strlen(str);
10943 int cmp;
10944
10945 len = Py_MIN(len1, len2);
10946 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010947 if (cmp != 0) {
10948 if (cmp < 0)
10949 return -1;
10950 else
10951 return 1;
10952 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010953 if (len1 > len2)
10954 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010955 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010956 return -1; /* str is longer */
10957 return 0;
10958 }
10959 else {
10960 void *data = PyUnicode_DATA(uni);
10961 /* Compare Unicode string and source character set string */
10962 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010963 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010964 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10965 /* This check keeps Python strings that end in '\0' from comparing equal
10966 to C strings identical up to that point. */
10967 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10968 return 1; /* uni is longer */
10969 if (str[i])
10970 return -1; /* str is longer */
10971 return 0;
10972 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010973}
10974
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020010975static int
10976non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
10977{
10978 size_t i, len;
10979 const wchar_t *p;
10980 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
10981 if (strlen(str) != len)
10982 return 0;
10983 p = _PyUnicode_WSTR(unicode);
10984 assert(p);
10985 for (i = 0; i < len; i++) {
10986 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020010987 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020010988 return 0;
10989 }
10990 return 1;
10991}
10992
10993int
10994_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
10995{
10996 size_t len;
10997 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020010998 assert(str);
10999#ifndef NDEBUG
11000 for (const char *p = str; *p; p++) {
11001 assert((unsigned char)*p < 128);
11002 }
11003#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011004 if (PyUnicode_READY(unicode) == -1) {
11005 /* Memory error or bad data */
11006 PyErr_Clear();
11007 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11008 }
11009 if (!PyUnicode_IS_ASCII(unicode))
11010 return 0;
11011 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11012 return strlen(str) == len &&
11013 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11014}
11015
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011016int
11017_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11018{
11019 PyObject *right_uni;
11020 Py_hash_t hash;
11021
11022 assert(_PyUnicode_CHECK(left));
11023 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011024#ifndef NDEBUG
11025 for (const char *p = right->string; *p; p++) {
11026 assert((unsigned char)*p < 128);
11027 }
11028#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011029
11030 if (PyUnicode_READY(left) == -1) {
11031 /* memory error or bad data */
11032 PyErr_Clear();
11033 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11034 }
11035
11036 if (!PyUnicode_IS_ASCII(left))
11037 return 0;
11038
11039 right_uni = _PyUnicode_FromId(right); /* borrowed */
11040 if (right_uni == NULL) {
11041 /* memory error or bad data */
11042 PyErr_Clear();
11043 return _PyUnicode_EqualToASCIIString(left, right->string);
11044 }
11045
11046 if (left == right_uni)
11047 return 1;
11048
11049 if (PyUnicode_CHECK_INTERNED(left))
11050 return 0;
11051
INADA Naoki7cc95f52018-01-28 02:07:09 +090011052 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011053 hash = _PyUnicode_HASH(left);
11054 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11055 return 0;
11056
11057 return unicode_compare_eq(left, right_uni);
11058}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011059
Alexander Belopolsky40018472011-02-26 01:02:56 +000011060PyObject *
11061PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011062{
11063 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011064
Victor Stinnere5567ad2012-10-23 02:48:49 +020011065 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11066 Py_RETURN_NOTIMPLEMENTED;
11067
11068 if (PyUnicode_READY(left) == -1 ||
11069 PyUnicode_READY(right) == -1)
11070 return NULL;
11071
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011072 if (left == right) {
11073 switch (op) {
11074 case Py_EQ:
11075 case Py_LE:
11076 case Py_GE:
11077 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011078 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011079 case Py_NE:
11080 case Py_LT:
11081 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011082 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011083 default:
11084 PyErr_BadArgument();
11085 return NULL;
11086 }
11087 }
11088 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011089 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011090 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011091 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011092 }
11093 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011094 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011095 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011096 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011097}
11098
Alexander Belopolsky40018472011-02-26 01:02:56 +000011099int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011100_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11101{
11102 return unicode_eq(aa, bb);
11103}
11104
11105int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011106PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011107{
Victor Stinner77282cb2013-04-14 19:22:47 +020011108 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011109 void *buf1, *buf2;
11110 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011111 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011112
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011113 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011114 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011115 "'in <string>' requires string as left operand, not %.100s",
11116 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011117 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011118 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011119 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011120 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011121 if (ensure_unicode(str) < 0)
11122 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011124 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011125 kind2 = PyUnicode_KIND(substr);
11126 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011127 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011128 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011129 len2 = PyUnicode_GET_LENGTH(substr);
11130 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011131 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011132 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011133 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011134 if (len2 == 1) {
11135 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11136 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011137 return result;
11138 }
11139 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011140 buf2 = _PyUnicode_AsKind(substr, kind1);
11141 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011142 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011143 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144
Victor Stinner77282cb2013-04-14 19:22:47 +020011145 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 case PyUnicode_1BYTE_KIND:
11147 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11148 break;
11149 case PyUnicode_2BYTE_KIND:
11150 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11151 break;
11152 case PyUnicode_4BYTE_KIND:
11153 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11154 break;
11155 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011156 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011158
Victor Stinner77282cb2013-04-14 19:22:47 +020011159 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160 PyMem_Free(buf2);
11161
Guido van Rossum403d68b2000-03-13 15:55:09 +000011162 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011163}
11164
Guido van Rossumd57fd912000-03-10 22:53:23 +000011165/* Concat to string or Unicode object giving a new Unicode object. */
11166
Alexander Belopolsky40018472011-02-26 01:02:56 +000011167PyObject *
11168PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011170 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011171 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011172 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011174 if (ensure_unicode(left) < 0)
11175 return NULL;
11176
11177 if (!PyUnicode_Check(right)) {
11178 PyErr_Format(PyExc_TypeError,
11179 "can only concatenate str (not \"%.200s\") to str",
11180 right->ob_type->tp_name);
11181 return NULL;
11182 }
11183 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011184 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185
11186 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011187 if (left == unicode_empty)
11188 return PyUnicode_FromObject(right);
11189 if (right == unicode_empty)
11190 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011192 left_len = PyUnicode_GET_LENGTH(left);
11193 right_len = PyUnicode_GET_LENGTH(right);
11194 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011195 PyErr_SetString(PyExc_OverflowError,
11196 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011197 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011198 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011199 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011200
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011201 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11202 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011203 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011206 result = PyUnicode_New(new_len, maxchar);
11207 if (result == NULL)
11208 return NULL;
11209 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11210 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11211 assert(_PyUnicode_CheckConsistency(result, 1));
11212 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213}
11214
Walter Dörwald1ab83302007-05-18 17:15:44 +000011215void
Victor Stinner23e56682011-10-03 03:54:37 +020011216PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011217{
Victor Stinner23e56682011-10-03 03:54:37 +020011218 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011219 Py_UCS4 maxchar, maxchar2;
11220 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011221
11222 if (p_left == NULL) {
11223 if (!PyErr_Occurred())
11224 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011225 return;
11226 }
Victor Stinner23e56682011-10-03 03:54:37 +020011227 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011228 if (right == NULL || left == NULL
11229 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011230 if (!PyErr_Occurred())
11231 PyErr_BadInternalCall();
11232 goto error;
11233 }
11234
Benjamin Petersonbac79492012-01-14 13:34:47 -050011235 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011236 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011237 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011238 goto error;
11239
Victor Stinner488fa492011-12-12 00:01:39 +010011240 /* Shortcuts */
11241 if (left == unicode_empty) {
11242 Py_DECREF(left);
11243 Py_INCREF(right);
11244 *p_left = right;
11245 return;
11246 }
11247 if (right == unicode_empty)
11248 return;
11249
11250 left_len = PyUnicode_GET_LENGTH(left);
11251 right_len = PyUnicode_GET_LENGTH(right);
11252 if (left_len > PY_SSIZE_T_MAX - right_len) {
11253 PyErr_SetString(PyExc_OverflowError,
11254 "strings are too large to concat");
11255 goto error;
11256 }
11257 new_len = left_len + right_len;
11258
11259 if (unicode_modifiable(left)
11260 && PyUnicode_CheckExact(right)
11261 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011262 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11263 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011264 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011265 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011266 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11267 {
11268 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011269 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011270 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011271
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011272 /* copy 'right' into the newly allocated area of 'left' */
11273 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011274 }
Victor Stinner488fa492011-12-12 00:01:39 +010011275 else {
11276 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11277 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011278 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011279
Victor Stinner488fa492011-12-12 00:01:39 +010011280 /* Concat the two Unicode strings */
11281 res = PyUnicode_New(new_len, maxchar);
11282 if (res == NULL)
11283 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011284 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11285 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011286 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011287 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011288 }
11289 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011290 return;
11291
11292error:
Victor Stinner488fa492011-12-12 00:01:39 +010011293 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011294}
11295
11296void
11297PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11298{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011299 PyUnicode_Append(pleft, right);
11300 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011301}
11302
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011303/*
11304Wraps stringlib_parse_args_finds() and additionally ensures that the
11305first argument is a unicode object.
11306*/
11307
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011308static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011309parse_args_finds_unicode(const char * function_name, PyObject *args,
11310 PyObject **substring,
11311 Py_ssize_t *start, Py_ssize_t *end)
11312{
11313 if(stringlib_parse_args_finds(function_name, args, substring,
11314 start, end)) {
11315 if (ensure_unicode(*substring) < 0)
11316 return 0;
11317 return 1;
11318 }
11319 return 0;
11320}
11321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011322PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011323 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011325Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011326string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011327interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328
11329static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011330unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011332 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011333 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011334 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011336 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 void *buf1, *buf2;
11338 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011340 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011341 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 kind1 = PyUnicode_KIND(self);
11344 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011345 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011346 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 len1 = PyUnicode_GET_LENGTH(self);
11349 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011350 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011351 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011352 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011353
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011354 buf1 = PyUnicode_DATA(self);
11355 buf2 = PyUnicode_DATA(substring);
11356 if (kind2 != kind1) {
11357 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011358 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011359 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011360 }
11361 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 case PyUnicode_1BYTE_KIND:
11363 iresult = ucs1lib_count(
11364 ((Py_UCS1*)buf1) + start, end - start,
11365 buf2, len2, PY_SSIZE_T_MAX
11366 );
11367 break;
11368 case PyUnicode_2BYTE_KIND:
11369 iresult = ucs2lib_count(
11370 ((Py_UCS2*)buf1) + start, end - start,
11371 buf2, len2, PY_SSIZE_T_MAX
11372 );
11373 break;
11374 case PyUnicode_4BYTE_KIND:
11375 iresult = ucs4lib_count(
11376 ((Py_UCS4*)buf1) + start, end - start,
11377 buf2, len2, PY_SSIZE_T_MAX
11378 );
11379 break;
11380 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011381 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 }
11383
11384 result = PyLong_FromSsize_t(iresult);
11385
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011386 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011388
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389 return result;
11390}
11391
INADA Naoki3ae20562017-01-16 20:41:20 +090011392/*[clinic input]
11393str.encode as unicode_encode
11394
11395 encoding: str(c_default="NULL") = 'utf-8'
11396 The encoding in which to encode the string.
11397 errors: str(c_default="NULL") = 'strict'
11398 The error handling scheme to use for encoding errors.
11399 The default is 'strict' meaning that encoding errors raise a
11400 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11401 'xmlcharrefreplace' as well as any other name registered with
11402 codecs.register_error that can handle UnicodeEncodeErrors.
11403
11404Encode the string using the codec registered for encoding.
11405[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406
11407static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011408unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011409/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011411 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011412}
11413
INADA Naoki3ae20562017-01-16 20:41:20 +090011414/*[clinic input]
11415str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416
INADA Naoki3ae20562017-01-16 20:41:20 +090011417 tabsize: int = 8
11418
11419Return a copy where all tab characters are expanded using spaces.
11420
11421If tabsize is not given, a tab size of 8 characters is assumed.
11422[clinic start generated code]*/
11423
11424static PyObject *
11425unicode_expandtabs_impl(PyObject *self, int tabsize)
11426/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011428 Py_ssize_t i, j, line_pos, src_len, incr;
11429 Py_UCS4 ch;
11430 PyObject *u;
11431 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011432 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011433 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434
Antoine Pitrou22425222011-10-04 19:10:51 +020011435 if (PyUnicode_READY(self) == -1)
11436 return NULL;
11437
Thomas Wouters7e474022000-07-16 12:04:32 +000011438 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011439 src_len = PyUnicode_GET_LENGTH(self);
11440 i = j = line_pos = 0;
11441 kind = PyUnicode_KIND(self);
11442 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011443 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011444 for (; i < src_len; i++) {
11445 ch = PyUnicode_READ(kind, src_data, i);
11446 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011447 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011448 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011449 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011450 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011451 goto overflow;
11452 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011453 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011454 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011455 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011457 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011458 goto overflow;
11459 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011461 if (ch == '\n' || ch == '\r')
11462 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011464 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011465 if (!found)
11466 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011467
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011469 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470 if (!u)
11471 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011472 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473
Antoine Pitroue71d5742011-10-04 15:55:09 +020011474 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475
Antoine Pitroue71d5742011-10-04 15:55:09 +020011476 for (; i < src_len; i++) {
11477 ch = PyUnicode_READ(kind, src_data, i);
11478 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011479 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011480 incr = tabsize - (line_pos % tabsize);
11481 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011482 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011483 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011484 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011485 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011486 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011487 line_pos++;
11488 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011489 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011490 if (ch == '\n' || ch == '\r')
11491 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011493 }
11494 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011495 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011496
Antoine Pitroue71d5742011-10-04 15:55:09 +020011497 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011498 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11499 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500}
11501
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011502PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504\n\
11505Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011506such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507arguments start and end are interpreted as in slice notation.\n\
11508\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011509Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510
11511static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011514 /* initialize variables to prevent gcc warning */
11515 PyObject *substring = NULL;
11516 Py_ssize_t start = 0;
11517 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011518 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011520 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011523 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011526 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 if (result == -2)
11529 return NULL;
11530
Christian Heimes217cfd12007-12-02 14:31:20 +000011531 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532}
11533
11534static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011535unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011537 void *data;
11538 enum PyUnicode_Kind kind;
11539 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011540
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011541 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011542 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011544 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011545 if (PyUnicode_READY(self) == -1) {
11546 return NULL;
11547 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011548 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11549 PyErr_SetString(PyExc_IndexError, "string index out of range");
11550 return NULL;
11551 }
11552 kind = PyUnicode_KIND(self);
11553 data = PyUnicode_DATA(self);
11554 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011555 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556}
11557
Guido van Rossumc2504932007-09-18 19:42:40 +000011558/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011559 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011560static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011561unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011563 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011564
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011565#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011566 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011567#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011568 if (_PyUnicode_HASH(self) != -1)
11569 return _PyUnicode_HASH(self);
11570 if (PyUnicode_READY(self) == -1)
11571 return -1;
animalizea1d14252019-01-02 20:16:06 +080011572
Christian Heimes985ecdc2013-11-20 11:46:18 +010011573 x = _Py_HashBytes(PyUnicode_DATA(self),
11574 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011576 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577}
11578
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011579PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011580 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581\n\
oldkaa0735f2018-02-02 16:52:55 +080011582Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011583such that sub is contained within S[start:end]. Optional\n\
11584arguments start and end are interpreted as in slice notation.\n\
11585\n\
11586Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587
11588static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011591 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011592 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011593 PyObject *substring = NULL;
11594 Py_ssize_t start = 0;
11595 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011597 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011600 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011602
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011603 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 if (result == -2)
11606 return NULL;
11607
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608 if (result < 0) {
11609 PyErr_SetString(PyExc_ValueError, "substring not found");
11610 return NULL;
11611 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011612
Christian Heimes217cfd12007-12-02 14:31:20 +000011613 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614}
11615
INADA Naoki3ae20562017-01-16 20:41:20 +090011616/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011617str.isascii as unicode_isascii
11618
11619Return True if all characters in the string are ASCII, False otherwise.
11620
11621ASCII characters have code points in the range U+0000-U+007F.
11622Empty string is ASCII too.
11623[clinic start generated code]*/
11624
11625static PyObject *
11626unicode_isascii_impl(PyObject *self)
11627/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11628{
11629 if (PyUnicode_READY(self) == -1) {
11630 return NULL;
11631 }
11632 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11633}
11634
11635/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011636str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637
INADA Naoki3ae20562017-01-16 20:41:20 +090011638Return True if the string is a lowercase string, False otherwise.
11639
11640A string is lowercase if all cased characters in the string are lowercase and
11641there is at least one cased character in the string.
11642[clinic start generated code]*/
11643
11644static PyObject *
11645unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011646/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011648 Py_ssize_t i, length;
11649 int kind;
11650 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011651 int cased;
11652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011653 if (PyUnicode_READY(self) == -1)
11654 return NULL;
11655 length = PyUnicode_GET_LENGTH(self);
11656 kind = PyUnicode_KIND(self);
11657 data = PyUnicode_DATA(self);
11658
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011660 if (length == 1)
11661 return PyBool_FromLong(
11662 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011664 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011666 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011667
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011669 for (i = 0; i < length; i++) {
11670 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011671
Benjamin Peterson29060642009-01-31 22:14:21 +000011672 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011673 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011674 else if (!cased && Py_UNICODE_ISLOWER(ch))
11675 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011677 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678}
11679
INADA Naoki3ae20562017-01-16 20:41:20 +090011680/*[clinic input]
11681str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682
INADA Naoki3ae20562017-01-16 20:41:20 +090011683Return True if the string is an uppercase string, False otherwise.
11684
11685A string is uppercase if all cased characters in the string are uppercase and
11686there is at least one cased character in the string.
11687[clinic start generated code]*/
11688
11689static PyObject *
11690unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011691/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 Py_ssize_t i, length;
11694 int kind;
11695 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696 int cased;
11697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 if (PyUnicode_READY(self) == -1)
11699 return NULL;
11700 length = PyUnicode_GET_LENGTH(self);
11701 kind = PyUnicode_KIND(self);
11702 data = PyUnicode_DATA(self);
11703
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 if (length == 1)
11706 return PyBool_FromLong(
11707 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011709 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011710 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011711 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011712
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 for (i = 0; i < length; i++) {
11715 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011716
Benjamin Peterson29060642009-01-31 22:14:21 +000011717 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011718 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011719 else if (!cased && Py_UNICODE_ISUPPER(ch))
11720 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011722 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723}
11724
INADA Naoki3ae20562017-01-16 20:41:20 +090011725/*[clinic input]
11726str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727
INADA Naoki3ae20562017-01-16 20:41:20 +090011728Return True if the string is a title-cased string, False otherwise.
11729
11730In a title-cased string, upper- and title-case characters may only
11731follow uncased characters and lowercase characters only cased ones.
11732[clinic start generated code]*/
11733
11734static PyObject *
11735unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011736/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011738 Py_ssize_t i, length;
11739 int kind;
11740 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741 int cased, previous_is_cased;
11742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011743 if (PyUnicode_READY(self) == -1)
11744 return NULL;
11745 length = PyUnicode_GET_LENGTH(self);
11746 kind = PyUnicode_KIND(self);
11747 data = PyUnicode_DATA(self);
11748
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011750 if (length == 1) {
11751 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11752 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11753 (Py_UNICODE_ISUPPER(ch) != 0));
11754 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011756 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011758 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011759
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760 cased = 0;
11761 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011762 for (i = 0; i < length; i++) {
11763 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011764
Benjamin Peterson29060642009-01-31 22:14:21 +000011765 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11766 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011767 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011768 previous_is_cased = 1;
11769 cased = 1;
11770 }
11771 else if (Py_UNICODE_ISLOWER(ch)) {
11772 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011773 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 previous_is_cased = 1;
11775 cased = 1;
11776 }
11777 else
11778 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011780 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781}
11782
INADA Naoki3ae20562017-01-16 20:41:20 +090011783/*[clinic input]
11784str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785
INADA Naoki3ae20562017-01-16 20:41:20 +090011786Return True if the string is a whitespace string, False otherwise.
11787
11788A string is whitespace if all characters in the string are whitespace and there
11789is at least one character in the string.
11790[clinic start generated code]*/
11791
11792static PyObject *
11793unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011794/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 Py_ssize_t i, length;
11797 int kind;
11798 void *data;
11799
11800 if (PyUnicode_READY(self) == -1)
11801 return NULL;
11802 length = PyUnicode_GET_LENGTH(self);
11803 kind = PyUnicode_KIND(self);
11804 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 if (length == 1)
11808 return PyBool_FromLong(
11809 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011811 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011812 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011813 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 for (i = 0; i < length; i++) {
11816 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011817 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011818 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011820 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821}
11822
INADA Naoki3ae20562017-01-16 20:41:20 +090011823/*[clinic input]
11824str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011825
INADA Naoki3ae20562017-01-16 20:41:20 +090011826Return True if the string is an alphabetic string, False otherwise.
11827
11828A string is alphabetic if all characters in the string are alphabetic and there
11829is at least one character in the string.
11830[clinic start generated code]*/
11831
11832static PyObject *
11833unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011834/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011835{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836 Py_ssize_t i, length;
11837 int kind;
11838 void *data;
11839
11840 if (PyUnicode_READY(self) == -1)
11841 return NULL;
11842 length = PyUnicode_GET_LENGTH(self);
11843 kind = PyUnicode_KIND(self);
11844 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011845
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011846 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 if (length == 1)
11848 return PyBool_FromLong(
11849 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011850
11851 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011853 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 for (i = 0; i < length; i++) {
11856 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011857 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011858 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011859 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011860}
11861
INADA Naoki3ae20562017-01-16 20:41:20 +090011862/*[clinic input]
11863str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011864
INADA Naoki3ae20562017-01-16 20:41:20 +090011865Return True if the string is an alpha-numeric string, False otherwise.
11866
11867A string is alpha-numeric if all characters in the string are alpha-numeric and
11868there is at least one character in the string.
11869[clinic start generated code]*/
11870
11871static PyObject *
11872unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011873/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011874{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 int kind;
11876 void *data;
11877 Py_ssize_t len, i;
11878
11879 if (PyUnicode_READY(self) == -1)
11880 return NULL;
11881
11882 kind = PyUnicode_KIND(self);
11883 data = PyUnicode_DATA(self);
11884 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011885
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011886 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 if (len == 1) {
11888 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11889 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11890 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011891
11892 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011894 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 for (i = 0; i < len; i++) {
11897 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011898 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011899 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011900 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011901 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011902}
11903
INADA Naoki3ae20562017-01-16 20:41:20 +090011904/*[clinic input]
11905str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906
INADA Naoki3ae20562017-01-16 20:41:20 +090011907Return True if the string is a decimal string, False otherwise.
11908
11909A string is a decimal string if all characters in the string are decimal and
11910there is at least one character in the string.
11911[clinic start generated code]*/
11912
11913static PyObject *
11914unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011915/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 Py_ssize_t i, length;
11918 int kind;
11919 void *data;
11920
11921 if (PyUnicode_READY(self) == -1)
11922 return NULL;
11923 length = PyUnicode_GET_LENGTH(self);
11924 kind = PyUnicode_KIND(self);
11925 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 if (length == 1)
11929 return PyBool_FromLong(
11930 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011932 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011934 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936 for (i = 0; i < length; i++) {
11937 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011938 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011940 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941}
11942
INADA Naoki3ae20562017-01-16 20:41:20 +090011943/*[clinic input]
11944str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945
INADA Naoki3ae20562017-01-16 20:41:20 +090011946Return True if the string is a digit string, False otherwise.
11947
11948A string is a digit string if all characters in the string are digits and there
11949is at least one character in the string.
11950[clinic start generated code]*/
11951
11952static PyObject *
11953unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011954/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 Py_ssize_t i, length;
11957 int kind;
11958 void *data;
11959
11960 if (PyUnicode_READY(self) == -1)
11961 return NULL;
11962 length = PyUnicode_GET_LENGTH(self);
11963 kind = PyUnicode_KIND(self);
11964 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 if (length == 1) {
11968 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11969 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011972 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011974 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 for (i = 0; i < length; i++) {
11977 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011978 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011980 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981}
11982
INADA Naoki3ae20562017-01-16 20:41:20 +090011983/*[clinic input]
11984str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985
INADA Naoki3ae20562017-01-16 20:41:20 +090011986Return True if the string is a numeric string, False otherwise.
11987
11988A string is numeric if all characters in the string are numeric and there is at
11989least one character in the string.
11990[clinic start generated code]*/
11991
11992static PyObject *
11993unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011994/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 Py_ssize_t i, length;
11997 int kind;
11998 void *data;
11999
12000 if (PyUnicode_READY(self) == -1)
12001 return NULL;
12002 length = PyUnicode_GET_LENGTH(self);
12003 kind = PyUnicode_KIND(self);
12004 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012007 if (length == 1)
12008 return PyBool_FromLong(
12009 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012011 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012013 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 for (i = 0; i < length; i++) {
12016 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012017 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012019 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020}
12021
Martin v. Löwis47383402007-08-15 07:32:56 +000012022int
12023PyUnicode_IsIdentifier(PyObject *self)
12024{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025 int kind;
12026 void *data;
12027 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012028 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 if (PyUnicode_READY(self) == -1) {
12031 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012032 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 }
12034
12035 /* Special case for empty strings */
12036 if (PyUnicode_GET_LENGTH(self) == 0)
12037 return 0;
12038 kind = PyUnicode_KIND(self);
12039 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012040
12041 /* PEP 3131 says that the first character must be in
12042 XID_Start and subsequent characters in XID_Continue,
12043 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012044 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012045 letters, digits, underscore). However, given the current
12046 definition of XID_Start and XID_Continue, it is sufficient
12047 to check just for these, except that _ must be allowed
12048 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012050 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012051 return 0;
12052
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012053 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012055 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012056 return 1;
12057}
12058
INADA Naoki3ae20562017-01-16 20:41:20 +090012059/*[clinic input]
12060str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012061
INADA Naoki3ae20562017-01-16 20:41:20 +090012062Return True if the string is a valid Python identifier, False otherwise.
12063
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012064Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012065such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012066[clinic start generated code]*/
12067
12068static PyObject *
12069unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012070/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012071{
12072 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12073}
12074
INADA Naoki3ae20562017-01-16 20:41:20 +090012075/*[clinic input]
12076str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012077
INADA Naoki3ae20562017-01-16 20:41:20 +090012078Return True if the string is printable, False otherwise.
12079
12080A string is printable if all of its characters are considered printable in
12081repr() or if it is empty.
12082[clinic start generated code]*/
12083
12084static PyObject *
12085unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012086/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012087{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 Py_ssize_t i, length;
12089 int kind;
12090 void *data;
12091
12092 if (PyUnicode_READY(self) == -1)
12093 return NULL;
12094 length = PyUnicode_GET_LENGTH(self);
12095 kind = PyUnicode_KIND(self);
12096 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012097
12098 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 if (length == 1)
12100 return PyBool_FromLong(
12101 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 for (i = 0; i < length; i++) {
12104 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012105 Py_RETURN_FALSE;
12106 }
12107 }
12108 Py_RETURN_TRUE;
12109}
12110
INADA Naoki3ae20562017-01-16 20:41:20 +090012111/*[clinic input]
12112str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113
INADA Naoki3ae20562017-01-16 20:41:20 +090012114 iterable: object
12115 /
12116
12117Concatenate any number of strings.
12118
Martin Panter91a88662017-01-24 00:30:06 +000012119The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012120The result is returned as a new string.
12121
12122Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12123[clinic start generated code]*/
12124
12125static PyObject *
12126unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012127/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128{
INADA Naoki3ae20562017-01-16 20:41:20 +090012129 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130}
12131
Martin v. Löwis18e16552006-02-15 17:27:45 +000012132static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012133unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012135 if (PyUnicode_READY(self) == -1)
12136 return -1;
12137 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012138}
12139
INADA Naoki3ae20562017-01-16 20:41:20 +090012140/*[clinic input]
12141str.ljust as unicode_ljust
12142
12143 width: Py_ssize_t
12144 fillchar: Py_UCS4 = ' '
12145 /
12146
12147Return a left-justified string of length width.
12148
12149Padding is done using the specified fill character (default is a space).
12150[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151
12152static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012153unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12154/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012156 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012157 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158
Victor Stinnerc4b49542011-12-11 22:44:26 +010012159 if (PyUnicode_GET_LENGTH(self) >= width)
12160 return unicode_result_unchanged(self);
12161
12162 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163}
12164
INADA Naoki3ae20562017-01-16 20:41:20 +090012165/*[clinic input]
12166str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167
INADA Naoki3ae20562017-01-16 20:41:20 +090012168Return a copy of the string converted to lowercase.
12169[clinic start generated code]*/
12170
12171static PyObject *
12172unicode_lower_impl(PyObject *self)
12173/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012175 if (PyUnicode_READY(self) == -1)
12176 return NULL;
12177 if (PyUnicode_IS_ASCII(self))
12178 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012179 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180}
12181
/* Selectors for which end(s) of a string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selectors above.  Both the strings and the
   table slots are read-only, so the table cannot be retargeted at run time. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the strip method for selector i (no bounds checking). */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012190
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012191/* externally visible for str.strip(unicode) */
12192PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012193_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012194{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 void *data;
12196 int kind;
12197 Py_ssize_t i, j, len;
12198 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012199 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12202 return NULL;
12203
12204 kind = PyUnicode_KIND(self);
12205 data = PyUnicode_DATA(self);
12206 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012207 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12209 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012210 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012211
Benjamin Peterson14339b62009-01-31 16:36:08 +000012212 i = 0;
12213 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012214 while (i < len) {
12215 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12216 if (!BLOOM(sepmask, ch))
12217 break;
12218 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12219 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012220 i++;
12221 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012222 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012223
Benjamin Peterson14339b62009-01-31 16:36:08 +000012224 j = len;
12225 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012226 j--;
12227 while (j >= i) {
12228 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12229 if (!BLOOM(sepmask, ch))
12230 break;
12231 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12232 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012233 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012234 }
12235
Benjamin Peterson29060642009-01-31 22:14:21 +000012236 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012237 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012238
Victor Stinner7931d9a2011-11-04 00:22:48 +010012239 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012240}
12241
12242PyObject*
12243PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12244{
12245 unsigned char *data;
12246 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012247 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248
Victor Stinnerde636f32011-10-01 03:55:54 +020012249 if (PyUnicode_READY(self) == -1)
12250 return NULL;
12251
Victor Stinner684d5fd2012-05-03 02:32:34 +020012252 length = PyUnicode_GET_LENGTH(self);
12253 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012254
Victor Stinner684d5fd2012-05-03 02:32:34 +020012255 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012256 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257
Victor Stinnerde636f32011-10-01 03:55:54 +020012258 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012259 PyErr_SetString(PyExc_IndexError, "string index out of range");
12260 return NULL;
12261 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012262 if (start >= length || end < start)
12263 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012264
Victor Stinner684d5fd2012-05-03 02:32:34 +020012265 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012266 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012267 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012268 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012269 }
12270 else {
12271 kind = PyUnicode_KIND(self);
12272 data = PyUnicode_1BYTE_DATA(self);
12273 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012274 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012275 length);
12276 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012277}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278
12279static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012280do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 Py_ssize_t len, i, j;
12283
12284 if (PyUnicode_READY(self) == -1)
12285 return NULL;
12286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012287 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012288
Victor Stinnercc7af722013-04-09 22:39:24 +020012289 if (PyUnicode_IS_ASCII(self)) {
12290 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12291
12292 i = 0;
12293 if (striptype != RIGHTSTRIP) {
12294 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012295 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012296 if (!_Py_ascii_whitespace[ch])
12297 break;
12298 i++;
12299 }
12300 }
12301
12302 j = len;
12303 if (striptype != LEFTSTRIP) {
12304 j--;
12305 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012306 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012307 if (!_Py_ascii_whitespace[ch])
12308 break;
12309 j--;
12310 }
12311 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012312 }
12313 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012314 else {
12315 int kind = PyUnicode_KIND(self);
12316 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012317
Victor Stinnercc7af722013-04-09 22:39:24 +020012318 i = 0;
12319 if (striptype != RIGHTSTRIP) {
12320 while (i < len) {
12321 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12322 if (!Py_UNICODE_ISSPACE(ch))
12323 break;
12324 i++;
12325 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012326 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012327
12328 j = len;
12329 if (striptype != LEFTSTRIP) {
12330 j--;
12331 while (j >= i) {
12332 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12333 if (!Py_UNICODE_ISSPACE(ch))
12334 break;
12335 j--;
12336 }
12337 j++;
12338 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012339 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012340
Victor Stinner7931d9a2011-11-04 00:22:48 +010012341 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012342}
12343
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012344
12345static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012346do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012347{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012348 if (sep != NULL && sep != Py_None) {
12349 if (PyUnicode_Check(sep))
12350 return _PyUnicode_XStrip(self, striptype, sep);
12351 else {
12352 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012353 "%s arg must be None or str",
12354 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012355 return NULL;
12356 }
12357 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012358
Benjamin Peterson14339b62009-01-31 16:36:08 +000012359 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012360}
12361
12362
INADA Naoki3ae20562017-01-16 20:41:20 +090012363/*[clinic input]
12364str.strip as unicode_strip
12365
12366 chars: object = None
12367 /
12368
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012370
12371If chars is given and not None, remove characters in chars instead.
12372[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012373
12374static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012375unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012376/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012377{
INADA Naoki3ae20562017-01-16 20:41:20 +090012378 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012379}
12380
12381
INADA Naoki3ae20562017-01-16 20:41:20 +090012382/*[clinic input]
12383str.lstrip as unicode_lstrip
12384
12385 chars: object = NULL
12386 /
12387
12388Return a copy of the string with leading whitespace removed.
12389
12390If chars is given and not None, remove characters in chars instead.
12391[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012392
12393static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012394unicode_lstrip_impl(PyObject *self, PyObject *chars)
12395/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012396{
INADA Naoki3ae20562017-01-16 20:41:20 +090012397 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012398}
12399
12400
INADA Naoki3ae20562017-01-16 20:41:20 +090012401/*[clinic input]
12402str.rstrip as unicode_rstrip
12403
12404 chars: object = NULL
12405 /
12406
12407Return a copy of the string with trailing whitespace removed.
12408
12409If chars is given and not None, remove characters in chars instead.
12410[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012411
12412static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012413unicode_rstrip_impl(PyObject *self, PyObject *chars)
12414/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012415{
INADA Naoki3ae20562017-01-16 20:41:20 +090012416 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012417}
12418
12419
Guido van Rossumd57fd912000-03-10 22:53:23 +000012420static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012421unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012423 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012425
Serhiy Storchaka05997252013-01-26 12:14:02 +020012426 if (len < 1)
12427 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012428
Victor Stinnerc4b49542011-12-11 22:44:26 +010012429 /* no repeat, return original string */
12430 if (len == 1)
12431 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012432
Benjamin Petersonbac79492012-01-14 13:34:47 -050012433 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012434 return NULL;
12435
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012436 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012437 PyErr_SetString(PyExc_OverflowError,
12438 "repeated string is too long");
12439 return NULL;
12440 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012442
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012443 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444 if (!u)
12445 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012446 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448 if (PyUnicode_GET_LENGTH(str) == 1) {
12449 const int kind = PyUnicode_KIND(str);
12450 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012451 if (kind == PyUnicode_1BYTE_KIND) {
12452 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012453 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012454 }
12455 else if (kind == PyUnicode_2BYTE_KIND) {
12456 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012457 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012458 ucs2[n] = fill_char;
12459 } else {
12460 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12461 assert(kind == PyUnicode_4BYTE_KIND);
12462 for (n = 0; n < len; ++n)
12463 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 }
12466 else {
12467 /* number of characters copied this far */
12468 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012469 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012470 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012471 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012473 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012474 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012475 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012476 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012477 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012478 }
12479
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012480 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012481 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012482}
12483
Alexander Belopolsky40018472011-02-26 01:02:56 +000012484PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012485PyUnicode_Replace(PyObject *str,
12486 PyObject *substr,
12487 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012488 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012490 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12491 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012492 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012493 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012494}
12495
INADA Naoki3ae20562017-01-16 20:41:20 +090012496/*[clinic input]
12497str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498
INADA Naoki3ae20562017-01-16 20:41:20 +090012499 old: unicode
12500 new: unicode
12501 count: Py_ssize_t = -1
12502 Maximum number of occurrences to replace.
12503 -1 (the default value) means replace all occurrences.
12504 /
12505
12506Return a copy with all occurrences of substring old replaced by new.
12507
12508If the optional argument count is given, only the first count occurrences are
12509replaced.
12510[clinic start generated code]*/
12511
12512static PyObject *
12513unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12514 Py_ssize_t count)
12515/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012517 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012518 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012519 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520}
12521
Alexander Belopolsky40018472011-02-26 01:02:56 +000012522static PyObject *
12523unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012525 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526 Py_ssize_t isize;
12527 Py_ssize_t osize, squote, dquote, i, o;
12528 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012529 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012530 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012532 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012533 return NULL;
12534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 isize = PyUnicode_GET_LENGTH(unicode);
12536 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012538 /* Compute length of output, quote characters, and
12539 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012540 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 max = 127;
12542 squote = dquote = 0;
12543 ikind = PyUnicode_KIND(unicode);
12544 for (i = 0; i < isize; i++) {
12545 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012546 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012548 case '\'': squote++; break;
12549 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012551 incr = 2;
12552 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 default:
12554 /* Fast-path ASCII */
12555 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012556 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012558 ;
12559 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012560 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012562 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012563 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012564 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012565 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012566 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012568 if (osize > PY_SSIZE_T_MAX - incr) {
12569 PyErr_SetString(PyExc_OverflowError,
12570 "string is too long to generate repr");
12571 return NULL;
12572 }
12573 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 }
12575
12576 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012577 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012579 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580 if (dquote)
12581 /* Both squote and dquote present. Use squote,
12582 and escape them */
12583 osize += squote;
12584 else
12585 quote = '"';
12586 }
Victor Stinner55c08782013-04-14 18:45:39 +020012587 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588
12589 repr = PyUnicode_New(osize, max);
12590 if (repr == NULL)
12591 return NULL;
12592 okind = PyUnicode_KIND(repr);
12593 odata = PyUnicode_DATA(repr);
12594
12595 PyUnicode_WRITE(okind, odata, 0, quote);
12596 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012597 if (unchanged) {
12598 _PyUnicode_FastCopyCharacters(repr, 1,
12599 unicode, 0,
12600 isize);
12601 }
12602 else {
12603 for (i = 0, o = 1; i < isize; i++) {
12604 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605
Victor Stinner55c08782013-04-14 18:45:39 +020012606 /* Escape quotes and backslashes */
12607 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012608 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012609 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012610 continue;
12611 }
12612
12613 /* Map special whitespace to '\t', \n', '\r' */
12614 if (ch == '\t') {
12615 PyUnicode_WRITE(okind, odata, o++, '\\');
12616 PyUnicode_WRITE(okind, odata, o++, 't');
12617 }
12618 else if (ch == '\n') {
12619 PyUnicode_WRITE(okind, odata, o++, '\\');
12620 PyUnicode_WRITE(okind, odata, o++, 'n');
12621 }
12622 else if (ch == '\r') {
12623 PyUnicode_WRITE(okind, odata, o++, '\\');
12624 PyUnicode_WRITE(okind, odata, o++, 'r');
12625 }
12626
12627 /* Map non-printable US ASCII to '\xhh' */
12628 else if (ch < ' ' || ch == 0x7F) {
12629 PyUnicode_WRITE(okind, odata, o++, '\\');
12630 PyUnicode_WRITE(okind, odata, o++, 'x');
12631 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12632 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12633 }
12634
12635 /* Copy ASCII characters as-is */
12636 else if (ch < 0x7F) {
12637 PyUnicode_WRITE(okind, odata, o++, ch);
12638 }
12639
12640 /* Non-ASCII characters */
12641 else {
12642 /* Map Unicode whitespace and control characters
12643 (categories Z* and C* except ASCII space)
12644 */
12645 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12646 PyUnicode_WRITE(okind, odata, o++, '\\');
12647 /* Map 8-bit characters to '\xhh' */
12648 if (ch <= 0xff) {
12649 PyUnicode_WRITE(okind, odata, o++, 'x');
12650 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12651 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12652 }
12653 /* Map 16-bit characters to '\uxxxx' */
12654 else if (ch <= 0xffff) {
12655 PyUnicode_WRITE(okind, odata, o++, 'u');
12656 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12657 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12658 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12659 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12660 }
12661 /* Map 21-bit characters to '\U00xxxxxx' */
12662 else {
12663 PyUnicode_WRITE(okind, odata, o++, 'U');
12664 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12665 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12666 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12667 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12668 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12669 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12670 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12671 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12672 }
12673 }
12674 /* Copy characters as-is */
12675 else {
12676 PyUnicode_WRITE(okind, odata, o++, ch);
12677 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012678 }
12679 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012682 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012683 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684}
12685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012686PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012687 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688\n\
12689Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012690such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691arguments start and end are interpreted as in slice notation.\n\
12692\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012693Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694
12695static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012698 /* initialize variables to prevent gcc warning */
12699 PyObject *substring = NULL;
12700 Py_ssize_t start = 0;
12701 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012702 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012704 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012705 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012707 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012709
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012710 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712 if (result == -2)
12713 return NULL;
12714
Christian Heimes217cfd12007-12-02 14:31:20 +000012715 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012716}
12717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012718PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012719 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012721Return the highest index in S where substring sub is found,\n\
12722such that sub is contained within S[start:end]. Optional\n\
12723arguments start and end are interpreted as in slice notation.\n\
12724\n\
12725Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012726
12727static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012730 /* initialize variables to prevent gcc warning */
12731 PyObject *substring = NULL;
12732 Py_ssize_t start = 0;
12733 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012734 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012735
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012736 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012737 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012738
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012739 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012740 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012742 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012744 if (result == -2)
12745 return NULL;
12746
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747 if (result < 0) {
12748 PyErr_SetString(PyExc_ValueError, "substring not found");
12749 return NULL;
12750 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751
Christian Heimes217cfd12007-12-02 14:31:20 +000012752 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012753}
12754
INADA Naoki3ae20562017-01-16 20:41:20 +090012755/*[clinic input]
12756str.rjust as unicode_rjust
12757
12758 width: Py_ssize_t
12759 fillchar: Py_UCS4 = ' '
12760 /
12761
12762Return a right-justified string of length width.
12763
12764Padding is done using the specified fill character (default is a space).
12765[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012766
12767static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012768unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12769/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012770{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012771 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012772 return NULL;
12773
Victor Stinnerc4b49542011-12-11 22:44:26 +010012774 if (PyUnicode_GET_LENGTH(self) >= width)
12775 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012776
Victor Stinnerc4b49542011-12-11 22:44:26 +010012777 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012778}
12779
Alexander Belopolsky40018472011-02-26 01:02:56 +000012780PyObject *
12781PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012782{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012783 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012784 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012785
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012786 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012787}
12788
INADA Naoki3ae20562017-01-16 20:41:20 +090012789/*[clinic input]
12790str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012791
INADA Naoki3ae20562017-01-16 20:41:20 +090012792 sep: object = None
        The delimiter according to which to split the string.
12794 None (the default value) means split according to any whitespace,
12795 and discard empty strings from the result.
12796 maxsplit: Py_ssize_t = -1
12797 Maximum number of splits to do.
12798 -1 (the default value) means no limit.
12799
12800Return a list of the words in the string, using sep as the delimiter string.
12801[clinic start generated code]*/
12802
12803static PyObject *
12804unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12805/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806{
INADA Naoki3ae20562017-01-16 20:41:20 +090012807 if (sep == Py_None)
12808 return split(self, NULL, maxsplit);
12809 if (PyUnicode_Check(sep))
12810 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012811
Victor Stinner998b8062018-09-12 00:23:25 +020012812 PyErr_Format(PyExc_TypeError,
12813 "must be str or None, not %.100s",
12814 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012815 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816}
12817
Thomas Wouters477c8d52006-05-27 19:21:47 +000012818PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012819PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012820{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012821 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012822 int kind1, kind2;
12823 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012824 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012825
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012826 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012827 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012828
Victor Stinner14f8f022011-10-05 20:58:25 +020012829 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012830 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012831 len1 = PyUnicode_GET_LENGTH(str_obj);
12832 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012833 if (kind1 < kind2 || len1 < len2) {
12834 _Py_INCREF_UNICODE_EMPTY();
12835 if (!unicode_empty)
12836 out = NULL;
12837 else {
12838 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12839 Py_DECREF(unicode_empty);
12840 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012841 return out;
12842 }
12843 buf1 = PyUnicode_DATA(str_obj);
12844 buf2 = PyUnicode_DATA(sep_obj);
12845 if (kind2 != kind1) {
12846 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12847 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012848 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012849 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012850
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012851 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012852 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012853 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12854 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12855 else
12856 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012857 break;
12858 case PyUnicode_2BYTE_KIND:
12859 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12860 break;
12861 case PyUnicode_4BYTE_KIND:
12862 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12863 break;
12864 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012865 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012866 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012867
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012868 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012869 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012870
12871 return out;
12872}
12873
12874
12875PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012876PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012877{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012878 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012879 int kind1, kind2;
12880 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012881 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012882
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012883 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012884 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012885
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012886 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012887 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012888 len1 = PyUnicode_GET_LENGTH(str_obj);
12889 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012890 if (kind1 < kind2 || len1 < len2) {
12891 _Py_INCREF_UNICODE_EMPTY();
12892 if (!unicode_empty)
12893 out = NULL;
12894 else {
12895 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12896 Py_DECREF(unicode_empty);
12897 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012898 return out;
12899 }
12900 buf1 = PyUnicode_DATA(str_obj);
12901 buf2 = PyUnicode_DATA(sep_obj);
12902 if (kind2 != kind1) {
12903 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12904 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012905 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012906 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012908 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012909 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012910 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12911 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12912 else
12913 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012914 break;
12915 case PyUnicode_2BYTE_KIND:
12916 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12917 break;
12918 case PyUnicode_4BYTE_KIND:
12919 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12920 break;
12921 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012922 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012923 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012924
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012925 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012926 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012927
12928 return out;
12929}
12930
INADA Naoki3ae20562017-01-16 20:41:20 +090012931/*[clinic input]
12932str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012933
INADA Naoki3ae20562017-01-16 20:41:20 +090012934 sep: object
12935 /
12936
12937Partition the string into three parts using the given separator.
12938
12939This will search for the separator in the string. If the separator is found,
12940returns a 3-tuple containing the part before the separator, the separator
12941itself, and the part after it.
12942
12943If the separator is not found, returns a 3-tuple containing the original string
12944and two empty strings.
12945[clinic start generated code]*/
12946
12947static PyObject *
12948unicode_partition(PyObject *self, PyObject *sep)
12949/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000012950{
INADA Naoki3ae20562017-01-16 20:41:20 +090012951 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012952}
12953
INADA Naoki3ae20562017-01-16 20:41:20 +090012954/*[clinic input]
12955str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012956
INADA Naoki3ae20562017-01-16 20:41:20 +090012957Partition the string into three parts using the given separator.
12958
Serhiy Storchakaa2314282017-10-29 02:11:54 +030012959This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090012960the separator is found, returns a 3-tuple containing the part before the
12961separator, the separator itself, and the part after it.
12962
12963If the separator is not found, returns a 3-tuple containing two empty strings
12964and the original string.
12965[clinic start generated code]*/
12966
12967static PyObject *
12968unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030012969/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000012970{
INADA Naoki3ae20562017-01-16 20:41:20 +090012971 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012972}
12973
Alexander Belopolsky40018472011-02-26 01:02:56 +000012974PyObject *
12975PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012976{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012977 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012978 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012979
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012980 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012981}
12982
INADA Naoki3ae20562017-01-16 20:41:20 +090012983/*[clinic input]
12984str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012985
INADA Naoki3ae20562017-01-16 20:41:20 +090012986Return a list of the words in the string, using sep as the delimiter string.
12987
12988Splits are done starting at the end of the string and working to the front.
12989[clinic start generated code]*/
12990
12991static PyObject *
12992unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12993/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012994{
INADA Naoki3ae20562017-01-16 20:41:20 +090012995 if (sep == Py_None)
12996 return rsplit(self, NULL, maxsplit);
12997 if (PyUnicode_Check(sep))
12998 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012999
Victor Stinner998b8062018-09-12 00:23:25 +020013000 PyErr_Format(PyExc_TypeError,
13001 "must be str or None, not %.100s",
13002 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013003 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013004}
13005
INADA Naoki3ae20562017-01-16 20:41:20 +090013006/*[clinic input]
13007str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013008
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013009 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013010
13011Return a list of the lines in the string, breaking at line boundaries.
13012
13013Line breaks are not included in the resulting list unless keepends is given and
13014true.
13015[clinic start generated code]*/
13016
13017static PyObject *
13018unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013019/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013020{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013021 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013022}
13023
13024static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013025PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013026{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013027 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013028}
13029
INADA Naoki3ae20562017-01-16 20:41:20 +090013030/*[clinic input]
13031str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013032
INADA Naoki3ae20562017-01-16 20:41:20 +090013033Convert uppercase characters to lowercase and lowercase characters to uppercase.
13034[clinic start generated code]*/
13035
13036static PyObject *
13037unicode_swapcase_impl(PyObject *self)
13038/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013039{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013040 if (PyUnicode_READY(self) == -1)
13041 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013042 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013043}
13044
Larry Hastings61272b72014-01-07 12:41:53 -080013045/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013046
Larry Hastings31826802013-10-19 00:09:25 -070013047@staticmethod
13048str.maketrans as unicode_maketrans
13049
13050 x: object
13051
13052 y: unicode=NULL
13053
13054 z: unicode=NULL
13055
13056 /
13057
13058Return a translation table usable for str.translate().
13059
13060If there is only one argument, it must be a dictionary mapping Unicode
13061ordinals (integers) or characters to Unicode ordinals, strings or None.
13062Character keys will be then converted to ordinals.
13063If there are two arguments, they must be strings of equal length, and
13064in the resulting dictionary, each character in x will be mapped to the
13065character at the same position in y. If there is a third argument, it
13066must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013067[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013068
Larry Hastings31826802013-10-19 00:09:25 -070013069static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013070unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013071/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013072{
Georg Brandlceee0772007-11-27 23:48:05 +000013073 PyObject *new = NULL, *key, *value;
13074 Py_ssize_t i = 0;
13075 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013076
Georg Brandlceee0772007-11-27 23:48:05 +000013077 new = PyDict_New();
13078 if (!new)
13079 return NULL;
13080 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013081 int x_kind, y_kind, z_kind;
13082 void *x_data, *y_data, *z_data;
13083
Georg Brandlceee0772007-11-27 23:48:05 +000013084 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013085 if (!PyUnicode_Check(x)) {
13086 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13087 "be a string if there is a second argument");
13088 goto err;
13089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013090 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013091 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13092 "arguments must have equal length");
13093 goto err;
13094 }
13095 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013096 x_kind = PyUnicode_KIND(x);
13097 y_kind = PyUnicode_KIND(y);
13098 x_data = PyUnicode_DATA(x);
13099 y_data = PyUnicode_DATA(y);
13100 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13101 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013102 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013103 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013104 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013105 if (!value) {
13106 Py_DECREF(key);
13107 goto err;
13108 }
Georg Brandlceee0772007-11-27 23:48:05 +000013109 res = PyDict_SetItem(new, key, value);
13110 Py_DECREF(key);
13111 Py_DECREF(value);
13112 if (res < 0)
13113 goto err;
13114 }
13115 /* create entries for deleting chars in z */
13116 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013117 z_kind = PyUnicode_KIND(z);
13118 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013119 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013120 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013121 if (!key)
13122 goto err;
13123 res = PyDict_SetItem(new, key, Py_None);
13124 Py_DECREF(key);
13125 if (res < 0)
13126 goto err;
13127 }
13128 }
13129 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130 int kind;
13131 void *data;
13132
Georg Brandlceee0772007-11-27 23:48:05 +000013133 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013134 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013135 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13136 "to maketrans it must be a dict");
13137 goto err;
13138 }
13139 /* copy entries into the new dict, converting string keys to int keys */
13140 while (PyDict_Next(x, &i, &key, &value)) {
13141 if (PyUnicode_Check(key)) {
13142 /* convert string keys to integer keys */
13143 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013144 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013145 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13146 "table must be of length 1");
13147 goto err;
13148 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013149 kind = PyUnicode_KIND(key);
13150 data = PyUnicode_DATA(key);
13151 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013152 if (!newkey)
13153 goto err;
13154 res = PyDict_SetItem(new, newkey, value);
13155 Py_DECREF(newkey);
13156 if (res < 0)
13157 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013158 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013159 /* just keep integer keys */
13160 if (PyDict_SetItem(new, key, value) < 0)
13161 goto err;
13162 } else {
13163 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13164 "be strings or integers");
13165 goto err;
13166 }
13167 }
13168 }
13169 return new;
13170 err:
13171 Py_DECREF(new);
13172 return NULL;
13173}
13174
INADA Naoki3ae20562017-01-16 20:41:20 +090013175/*[clinic input]
13176str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013177
INADA Naoki3ae20562017-01-16 20:41:20 +090013178 table: object
13179 Translation table, which must be a mapping of Unicode ordinals to
13180 Unicode ordinals, strings, or None.
13181 /
13182
13183Replace each character in the string using the given translation table.
13184
13185The table must implement lookup/indexing via __getitem__, for instance a
13186dictionary or list. If this operation raises LookupError, the character is
13187left untouched. Characters mapped to None are deleted.
13188[clinic start generated code]*/
13189
13190static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013191unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013192/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013193{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013194 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013195}
13196
INADA Naoki3ae20562017-01-16 20:41:20 +090013197/*[clinic input]
13198str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013199
INADA Naoki3ae20562017-01-16 20:41:20 +090013200Return a copy of the string converted to uppercase.
13201[clinic start generated code]*/
13202
13203static PyObject *
13204unicode_upper_impl(PyObject *self)
13205/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013206{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013207 if (PyUnicode_READY(self) == -1)
13208 return NULL;
13209 if (PyUnicode_IS_ASCII(self))
13210 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013211 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013212}
13213
INADA Naoki3ae20562017-01-16 20:41:20 +090013214/*[clinic input]
13215str.zfill as unicode_zfill
13216
13217 width: Py_ssize_t
13218 /
13219
13220Pad a numeric string with zeros on the left, to fill a field of the given width.
13221
13222The string is never truncated.
13223[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013224
13225static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013226unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013227/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013228{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013229 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013230 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013231 int kind;
13232 void *data;
13233 Py_UCS4 chr;
13234
Benjamin Petersonbac79492012-01-14 13:34:47 -050013235 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013237
Victor Stinnerc4b49542011-12-11 22:44:26 +010013238 if (PyUnicode_GET_LENGTH(self) >= width)
13239 return unicode_result_unchanged(self);
13240
13241 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013242
13243 u = pad(self, fill, 0, '0');
13244
Walter Dörwald068325e2002-04-15 13:36:47 +000013245 if (u == NULL)
13246 return NULL;
13247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013248 kind = PyUnicode_KIND(u);
13249 data = PyUnicode_DATA(u);
13250 chr = PyUnicode_READ(kind, data, fill);
13251
13252 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013254 PyUnicode_WRITE(kind, data, 0, chr);
13255 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013256 }
13257
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013258 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013259 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013260}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013261
#if 0
/* Compiled out by the surrounding #if 0.  When enabled this would forward
   to PyUnicode_TransformDecimalAndSpaceToASCII() and return its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13269
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013270PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013271 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013272\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013273Return True if S starts with the specified prefix, False otherwise.\n\
13274With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013275With optional end, stop comparing S at that position.\n\
13276prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277
13278static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013279unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013280 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013281{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013282 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013283 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013284 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013285 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013286 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013287
Jesus Ceaac451502011-04-20 17:09:23 +020013288 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013289 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013290 if (PyTuple_Check(subobj)) {
13291 Py_ssize_t i;
13292 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013293 substring = PyTuple_GET_ITEM(subobj, i);
13294 if (!PyUnicode_Check(substring)) {
13295 PyErr_Format(PyExc_TypeError,
13296 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013297 "not %.100s",
13298 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013299 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013300 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013301 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013302 if (result == -1)
13303 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013304 if (result) {
13305 Py_RETURN_TRUE;
13306 }
13307 }
13308 /* nothing matched */
13309 Py_RETURN_FALSE;
13310 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013311 if (!PyUnicode_Check(subobj)) {
13312 PyErr_Format(PyExc_TypeError,
13313 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013314 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013315 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013316 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013317 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013318 if (result == -1)
13319 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013320 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013321}
13322
13323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013324PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013325 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013326\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013327Return True if S ends with the specified suffix, False otherwise.\n\
13328With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013329With optional end, stop comparing S at that position.\n\
13330suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013331
13332static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013333unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013334 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013335{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013336 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013337 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013338 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013339 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013340 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013341
Jesus Ceaac451502011-04-20 17:09:23 +020013342 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013343 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013344 if (PyTuple_Check(subobj)) {
13345 Py_ssize_t i;
13346 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013347 substring = PyTuple_GET_ITEM(subobj, i);
13348 if (!PyUnicode_Check(substring)) {
13349 PyErr_Format(PyExc_TypeError,
13350 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013351 "not %.100s",
13352 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013353 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013354 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013355 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013356 if (result == -1)
13357 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013358 if (result) {
13359 Py_RETURN_TRUE;
13360 }
13361 }
13362 Py_RETURN_FALSE;
13363 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013364 if (!PyUnicode_Check(subobj)) {
13365 PyErr_Format(PyExc_TypeError,
13366 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013367 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013368 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013369 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013370 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013371 if (result == -1)
13372 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013373 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013374}
13375
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013376static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013377_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013378{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013379 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13380 writer->data = PyUnicode_DATA(writer->buffer);
13381
13382 if (!writer->readonly) {
13383 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013384 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013385 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013386 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013387 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13388 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13389 writer->kind = PyUnicode_WCHAR_KIND;
13390 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13391
Victor Stinner8f674cc2013-04-17 23:02:17 +020013392 /* Copy-on-write mode: set buffer size to 0 so
13393 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13394 * next write. */
13395 writer->size = 0;
13396 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013397}
13398
Victor Stinnerd3f08822012-05-29 12:57:52 +020013399void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013400_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013401{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013402 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013403
13404 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013405 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013406
13407 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13408 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13409 writer->kind = PyUnicode_WCHAR_KIND;
13410 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013411}
13412
Victor Stinnerd3f08822012-05-29 12:57:52 +020013413int
13414_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13415 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013416{
13417 Py_ssize_t newlen;
13418 PyObject *newbuffer;
13419
Victor Stinner2740e462016-09-06 16:58:36 -070013420 assert(maxchar <= MAX_UNICODE);
13421
Victor Stinnerca9381e2015-09-22 00:58:32 +020013422 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013423 assert((maxchar > writer->maxchar && length >= 0)
13424 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013425
Victor Stinner202fdca2012-05-07 12:47:02 +020013426 if (length > PY_SSIZE_T_MAX - writer->pos) {
13427 PyErr_NoMemory();
13428 return -1;
13429 }
13430 newlen = writer->pos + length;
13431
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013432 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013433
Victor Stinnerd3f08822012-05-29 12:57:52 +020013434 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013435 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013436 if (writer->overallocate
13437 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13438 /* overallocate to limit the number of realloc() */
13439 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013440 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013441 if (newlen < writer->min_length)
13442 newlen = writer->min_length;
13443
Victor Stinnerd3f08822012-05-29 12:57:52 +020013444 writer->buffer = PyUnicode_New(newlen, maxchar);
13445 if (writer->buffer == NULL)
13446 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013447 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013448 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013449 if (writer->overallocate
13450 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13451 /* overallocate to limit the number of realloc() */
13452 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013453 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013454 if (newlen < writer->min_length)
13455 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013456
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013457 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013458 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013459 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013460 newbuffer = PyUnicode_New(newlen, maxchar);
13461 if (newbuffer == NULL)
13462 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013463 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13464 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013465 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013466 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013467 }
13468 else {
13469 newbuffer = resize_compact(writer->buffer, newlen);
13470 if (newbuffer == NULL)
13471 return -1;
13472 }
13473 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013474 }
13475 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013476 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013477 newbuffer = PyUnicode_New(writer->size, maxchar);
13478 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013479 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013480 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13481 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013482 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013483 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013484 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013485 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013486
13487#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013488}
13489
Victor Stinnerca9381e2015-09-22 00:58:32 +020013490int
13491_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13492 enum PyUnicode_Kind kind)
13493{
13494 Py_UCS4 maxchar;
13495
13496 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13497 assert(writer->kind < kind);
13498
13499 switch (kind)
13500 {
13501 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13502 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13503 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13504 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013505 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013506 }
13507
13508 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13509}
13510
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013511static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013512_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013513{
Victor Stinner2740e462016-09-06 16:58:36 -070013514 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013515 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13516 return -1;
13517 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13518 writer->pos++;
13519 return 0;
13520}
13521
13522int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013523_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13524{
13525 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13526}
13527
13528int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013529_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13530{
13531 Py_UCS4 maxchar;
13532 Py_ssize_t len;
13533
13534 if (PyUnicode_READY(str) == -1)
13535 return -1;
13536 len = PyUnicode_GET_LENGTH(str);
13537 if (len == 0)
13538 return 0;
13539 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13540 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013541 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013542 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013543 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013544 Py_INCREF(str);
13545 writer->buffer = str;
13546 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013547 writer->pos += len;
13548 return 0;
13549 }
13550 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13551 return -1;
13552 }
13553 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13554 str, 0, len);
13555 writer->pos += len;
13556 return 0;
13557}
13558
Victor Stinnere215d962012-10-06 23:03:36 +020013559int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013560_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13561 Py_ssize_t start, Py_ssize_t end)
13562{
13563 Py_UCS4 maxchar;
13564 Py_ssize_t len;
13565
13566 if (PyUnicode_READY(str) == -1)
13567 return -1;
13568
13569 assert(0 <= start);
13570 assert(end <= PyUnicode_GET_LENGTH(str));
13571 assert(start <= end);
13572
13573 if (end == 0)
13574 return 0;
13575
13576 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13577 return _PyUnicodeWriter_WriteStr(writer, str);
13578
13579 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13580 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13581 else
13582 maxchar = writer->maxchar;
13583 len = end - start;
13584
13585 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13586 return -1;
13587
13588 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13589 str, start, len);
13590 writer->pos += len;
13591 return 0;
13592}
13593
13594int
Victor Stinner4a587072013-11-19 12:54:53 +010013595_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13596 const char *ascii, Py_ssize_t len)
13597{
13598 if (len == -1)
13599 len = strlen(ascii);
13600
13601 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13602
13603 if (writer->buffer == NULL && !writer->overallocate) {
13604 PyObject *str;
13605
13606 str = _PyUnicode_FromASCII(ascii, len);
13607 if (str == NULL)
13608 return -1;
13609
13610 writer->readonly = 1;
13611 writer->buffer = str;
13612 _PyUnicodeWriter_Update(writer);
13613 writer->pos += len;
13614 return 0;
13615 }
13616
13617 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13618 return -1;
13619
13620 switch (writer->kind)
13621 {
13622 case PyUnicode_1BYTE_KIND:
13623 {
13624 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13625 Py_UCS1 *data = writer->data;
13626
Christian Heimesf051e432016-09-13 20:22:02 +020013627 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013628 break;
13629 }
13630 case PyUnicode_2BYTE_KIND:
13631 {
13632 _PyUnicode_CONVERT_BYTES(
13633 Py_UCS1, Py_UCS2,
13634 ascii, ascii + len,
13635 (Py_UCS2 *)writer->data + writer->pos);
13636 break;
13637 }
13638 case PyUnicode_4BYTE_KIND:
13639 {
13640 _PyUnicode_CONVERT_BYTES(
13641 Py_UCS1, Py_UCS4,
13642 ascii, ascii + len,
13643 (Py_UCS4 *)writer->data + writer->pos);
13644 break;
13645 }
13646 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013647 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013648 }
13649
13650 writer->pos += len;
13651 return 0;
13652}
13653
13654int
13655_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13656 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013657{
13658 Py_UCS4 maxchar;
13659
13660 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13661 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13662 return -1;
13663 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13664 writer->pos += len;
13665 return 0;
13666}
13667
Victor Stinnerd3f08822012-05-29 12:57:52 +020013668PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013669_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013670{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013671 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013672
Victor Stinnerd3f08822012-05-29 12:57:52 +020013673 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013674 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013675 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013676 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013677
13678 str = writer->buffer;
13679 writer->buffer = NULL;
13680
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013681 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013682 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13683 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013684 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013685
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013686 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13687 PyObject *str2;
13688 str2 = resize_compact(str, writer->pos);
13689 if (str2 == NULL) {
13690 Py_DECREF(str);
13691 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013692 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013693 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013694 }
13695
Victor Stinner15a0bd32013-07-08 22:29:55 +020013696 assert(_PyUnicode_CheckConsistency(str, 1));
13697 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013698}
13699
Victor Stinnerd3f08822012-05-29 12:57:52 +020013700void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013701_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013702{
13703 Py_CLEAR(writer->buffer);
13704}
13705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013706#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013707
13708PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013709 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013710\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013711Return a formatted version of S, using substitutions from args and kwargs.\n\
13712The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013713
Eric Smith27bbca62010-11-04 17:06:58 +000013714PyDoc_STRVAR(format_map__doc__,
13715 "S.format_map(mapping) -> str\n\
13716\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013717Return a formatted version of S, using substitutions from mapping.\n\
13718The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013719
INADA Naoki3ae20562017-01-16 20:41:20 +090013720/*[clinic input]
13721str.__format__ as unicode___format__
13722
13723 format_spec: unicode
13724 /
13725
13726Return a formatted version of the string as described by format_spec.
13727[clinic start generated code]*/
13728
Eric Smith4a7d76d2008-05-30 18:10:19 +000013729static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013730unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013731/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013732{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013733 _PyUnicodeWriter writer;
13734 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013735
Victor Stinnerd3f08822012-05-29 12:57:52 +020013736 if (PyUnicode_READY(self) == -1)
13737 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013738 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013739 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13740 self, format_spec, 0,
13741 PyUnicode_GET_LENGTH(format_spec));
13742 if (ret == -1) {
13743 _PyUnicodeWriter_Dealloc(&writer);
13744 return NULL;
13745 }
13746 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013747}
13748
INADA Naoki3ae20562017-01-16 20:41:20 +090013749/*[clinic input]
13750str.__sizeof__ as unicode_sizeof
13751
13752Return the size of the string in memory, in bytes.
13753[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013754
13755static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013756unicode_sizeof_impl(PyObject *self)
13757/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013758{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013759 Py_ssize_t size;
13760
13761 /* If it's a compact object, account for base structure +
13762 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013763 if (PyUnicode_IS_COMPACT_ASCII(self))
13764 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13765 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013766 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013767 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013768 else {
13769 /* If it is a two-block object, account for base object, and
13770 for character block if present. */
13771 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013772 if (_PyUnicode_DATA_ANY(self))
13773 size += (PyUnicode_GET_LENGTH(self) + 1) *
13774 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013775 }
13776 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013777 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013778 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13779 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13780 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13781 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013782
13783 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013784}
13785
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013786static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013787unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013788{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013789 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013790 if (!copy)
13791 return NULL;
13792 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013793}
13794
Guido van Rossumd57fd912000-03-10 22:53:23 +000013795static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013796 UNICODE_ENCODE_METHODDEF
13797 UNICODE_REPLACE_METHODDEF
13798 UNICODE_SPLIT_METHODDEF
13799 UNICODE_RSPLIT_METHODDEF
13800 UNICODE_JOIN_METHODDEF
13801 UNICODE_CAPITALIZE_METHODDEF
13802 UNICODE_CASEFOLD_METHODDEF
13803 UNICODE_TITLE_METHODDEF
13804 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013805 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013806 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013807 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013808 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013809 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013810 UNICODE_LJUST_METHODDEF
13811 UNICODE_LOWER_METHODDEF
13812 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013813 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13814 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013815 UNICODE_RJUST_METHODDEF
13816 UNICODE_RSTRIP_METHODDEF
13817 UNICODE_RPARTITION_METHODDEF
13818 UNICODE_SPLITLINES_METHODDEF
13819 UNICODE_STRIP_METHODDEF
13820 UNICODE_SWAPCASE_METHODDEF
13821 UNICODE_TRANSLATE_METHODDEF
13822 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013823 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13824 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090013825 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013826 UNICODE_ISLOWER_METHODDEF
13827 UNICODE_ISUPPER_METHODDEF
13828 UNICODE_ISTITLE_METHODDEF
13829 UNICODE_ISSPACE_METHODDEF
13830 UNICODE_ISDECIMAL_METHODDEF
13831 UNICODE_ISDIGIT_METHODDEF
13832 UNICODE_ISNUMERIC_METHODDEF
13833 UNICODE_ISALPHA_METHODDEF
13834 UNICODE_ISALNUM_METHODDEF
13835 UNICODE_ISIDENTIFIER_METHODDEF
13836 UNICODE_ISPRINTABLE_METHODDEF
13837 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020013838 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013839 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013840 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013841 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013842 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013843#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013844 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013845 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013846#endif
13847
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013848 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013849 {NULL, NULL}
13850};
13851
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013852static PyObject *
13853unicode_mod(PyObject *v, PyObject *w)
13854{
Brian Curtindfc80e32011-08-10 20:28:54 -050013855 if (!PyUnicode_Check(v))
13856 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013857 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013858}
13859
13860static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013861 0, /*nb_add*/
13862 0, /*nb_subtract*/
13863 0, /*nb_multiply*/
13864 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013865};
13866
Guido van Rossumd57fd912000-03-10 22:53:23 +000013867static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013868 (lenfunc) unicode_length, /* sq_length */
13869 PyUnicode_Concat, /* sq_concat */
13870 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13871 (ssizeargfunc) unicode_getitem, /* sq_item */
13872 0, /* sq_slice */
13873 0, /* sq_ass_item */
13874 0, /* sq_ass_slice */
13875 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013876};
13877
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013878static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013879unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013880{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013881 if (PyUnicode_READY(self) == -1)
13882 return NULL;
13883
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013884 if (PyIndex_Check(item)) {
13885 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013886 if (i == -1 && PyErr_Occurred())
13887 return NULL;
13888 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013889 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013890 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013891 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013892 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013893 PyObject *result;
13894 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013895 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013896 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013897
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013898 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013899 return NULL;
13900 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013901 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13902 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013903
13904 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013905 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013906 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013907 slicelength == PyUnicode_GET_LENGTH(self)) {
13908 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013909 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013910 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013911 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013912 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013913 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013914 src_kind = PyUnicode_KIND(self);
13915 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013916 if (!PyUnicode_IS_ASCII(self)) {
13917 kind_limit = kind_maxchar_limit(src_kind);
13918 max_char = 0;
13919 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13920 ch = PyUnicode_READ(src_kind, src_data, cur);
13921 if (ch > max_char) {
13922 max_char = ch;
13923 if (max_char >= kind_limit)
13924 break;
13925 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013926 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013927 }
Victor Stinner55c99112011-10-13 01:17:06 +020013928 else
13929 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013930 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013931 if (result == NULL)
13932 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013933 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013934 dest_data = PyUnicode_DATA(result);
13935
13936 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013937 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13938 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013939 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013940 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013941 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013942 } else {
13943 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13944 return NULL;
13945 }
13946}
13947
13948static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013949 (lenfunc)unicode_length, /* mp_length */
13950 (binaryfunc)unicode_subscript, /* mp_subscript */
13951 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013952};
13953
Guido van Rossumd57fd912000-03-10 22:53:23 +000013954
Guido van Rossumd57fd912000-03-10 22:53:23 +000013955/* Helpers for PyUnicode_Format() */
13956
Victor Stinnera47082312012-10-04 02:19:54 +020013957struct unicode_formatter_t {
13958 PyObject *args;
13959 int args_owned;
13960 Py_ssize_t arglen, argidx;
13961 PyObject *dict;
13962
13963 enum PyUnicode_Kind fmtkind;
13964 Py_ssize_t fmtcnt, fmtpos;
13965 void *fmtdata;
13966 PyObject *fmtstr;
13967
13968 _PyUnicodeWriter writer;
13969};
13970
13971struct unicode_format_arg_t {
13972 Py_UCS4 ch;
13973 int flags;
13974 Py_ssize_t width;
13975 int prec;
13976 int sign;
13977};
13978
Guido van Rossumd57fd912000-03-10 22:53:23 +000013979static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013980unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013981{
Victor Stinnera47082312012-10-04 02:19:54 +020013982 Py_ssize_t argidx = ctx->argidx;
13983
13984 if (argidx < ctx->arglen) {
13985 ctx->argidx++;
13986 if (ctx->arglen < 0)
13987 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013988 else
Victor Stinnera47082312012-10-04 02:19:54 +020013989 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013990 }
13991 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013992 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013993 return NULL;
13994}
13995
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013996/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013997
Victor Stinnera47082312012-10-04 02:19:54 +020013998/* Format a float into the writer if the writer is not NULL, or into *p_output
13999 otherwise.
14000
14001 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014002static int
Victor Stinnera47082312012-10-04 02:19:54 +020014003formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14004 PyObject **p_output,
14005 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014006{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014007 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014008 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014009 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014010 int prec;
14011 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014012
Guido van Rossumd57fd912000-03-10 22:53:23 +000014013 x = PyFloat_AsDouble(v);
14014 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014015 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014016
Victor Stinnera47082312012-10-04 02:19:54 +020014017 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014018 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014019 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014020
Victor Stinnera47082312012-10-04 02:19:54 +020014021 if (arg->flags & F_ALT)
14022 dtoa_flags = Py_DTSF_ALT;
14023 else
14024 dtoa_flags = 0;
14025 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014026 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014027 return -1;
14028 len = strlen(p);
14029 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014030 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014031 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014032 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014033 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014034 }
14035 else
14036 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014037 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014038 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014039}
14040
Victor Stinnerd0880d52012-04-27 23:40:13 +020014041/* formatlong() emulates the format codes d, u, o, x and X, and
14042 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14043 * Python's regular ints.
14044 * Return value: a new PyUnicodeObject*, or NULL if error.
14045 * The output string is of the form
14046 * "-"? ("0x" | "0X")? digit+
14047 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14048 * set in flags. The case of hex digits will be correct,
14049 * There will be at least prec digits, zero-filled on the left if
14050 * necessary to get that many.
14051 * val object to be converted
14052 * flags bitmask of format flags; only F_ALT is looked at
14053 * prec minimum number of digits; 0-fill on left if needed
14054 * type a character in [duoxX]; u acts the same as d
14055 *
14056 * CAUTION: o, x and X conversions on regular ints can never
14057 * produce a '-' sign, but can for Python's unbounded ints.
14058 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014059PyObject *
14060_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014061{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014062 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014063 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014064 Py_ssize_t i;
14065 int sign; /* 1 if '-', else 0 */
14066 int len; /* number of characters */
14067 Py_ssize_t llen;
14068 int numdigits; /* len == numnondigits + numdigits */
14069 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014070
Victor Stinnerd0880d52012-04-27 23:40:13 +020014071 /* Avoid exceeding SSIZE_T_MAX */
14072 if (prec > INT_MAX-3) {
14073 PyErr_SetString(PyExc_OverflowError,
14074 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014075 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014076 }
14077
14078 assert(PyLong_Check(val));
14079
14080 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014081 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014082 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014083 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014084 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014085 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014086 /* int and int subclasses should print numerically when a numeric */
14087 /* format code is used (see issue18780) */
14088 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014089 break;
14090 case 'o':
14091 numnondigits = 2;
14092 result = PyNumber_ToBase(val, 8);
14093 break;
14094 case 'x':
14095 case 'X':
14096 numnondigits = 2;
14097 result = PyNumber_ToBase(val, 16);
14098 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014099 }
14100 if (!result)
14101 return NULL;
14102
14103 assert(unicode_modifiable(result));
14104 assert(PyUnicode_IS_READY(result));
14105 assert(PyUnicode_IS_ASCII(result));
14106
14107 /* To modify the string in-place, there can only be one reference. */
14108 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014109 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014110 PyErr_BadInternalCall();
14111 return NULL;
14112 }
14113 buf = PyUnicode_DATA(result);
14114 llen = PyUnicode_GET_LENGTH(result);
14115 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014116 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014117 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014118 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014119 return NULL;
14120 }
14121 len = (int)llen;
14122 sign = buf[0] == '-';
14123 numnondigits += sign;
14124 numdigits = len - numnondigits;
14125 assert(numdigits > 0);
14126
14127 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014128 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014129 (type == 'o' || type == 'x' || type == 'X'))) {
14130 assert(buf[sign] == '0');
14131 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14132 buf[sign+1] == 'o');
14133 numnondigits -= 2;
14134 buf += 2;
14135 len -= 2;
14136 if (sign)
14137 buf[0] = '-';
14138 assert(len == numnondigits + numdigits);
14139 assert(numdigits > 0);
14140 }
14141
14142 /* Fill with leading zeroes to meet minimum width. */
14143 if (prec > numdigits) {
14144 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14145 numnondigits + prec);
14146 char *b1;
14147 if (!r1) {
14148 Py_DECREF(result);
14149 return NULL;
14150 }
14151 b1 = PyBytes_AS_STRING(r1);
14152 for (i = 0; i < numnondigits; ++i)
14153 *b1++ = *buf++;
14154 for (i = 0; i < prec - numdigits; i++)
14155 *b1++ = '0';
14156 for (i = 0; i < numdigits; i++)
14157 *b1++ = *buf++;
14158 *b1 = '\0';
14159 Py_DECREF(result);
14160 result = r1;
14161 buf = PyBytes_AS_STRING(result);
14162 len = numnondigits + prec;
14163 }
14164
14165 /* Fix up case for hex conversions. */
14166 if (type == 'X') {
14167 /* Need to convert all lower case letters to upper case.
14168 and need to convert 0x to 0X (and -0x to -0X). */
14169 for (i = 0; i < len; i++)
14170 if (buf[i] >= 'a' && buf[i] <= 'x')
14171 buf[i] -= 'a'-'A';
14172 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014173 if (!PyUnicode_Check(result)
14174 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014175 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014176 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014177 Py_DECREF(result);
14178 result = unicode;
14179 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014180 else if (len != PyUnicode_GET_LENGTH(result)) {
14181 if (PyUnicode_Resize(&result, len) < 0)
14182 Py_CLEAR(result);
14183 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014184 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014185}
14186
Ethan Furmandf3ed242014-01-05 06:50:30 -080014187/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014188 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014189 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014190 * -1 and raise an exception on error */
14191static int
Victor Stinnera47082312012-10-04 02:19:54 +020014192mainformatlong(PyObject *v,
14193 struct unicode_format_arg_t *arg,
14194 PyObject **p_output,
14195 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014196{
14197 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014198 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014199
14200 if (!PyNumber_Check(v))
14201 goto wrongtype;
14202
Ethan Furman9ab74802014-03-21 06:38:46 -070014203 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014204 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014205 if (type == 'o' || type == 'x' || type == 'X') {
14206 iobj = PyNumber_Index(v);
14207 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014208 if (PyErr_ExceptionMatches(PyExc_TypeError))
14209 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014210 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014211 }
14212 }
14213 else {
14214 iobj = PyNumber_Long(v);
14215 if (iobj == NULL ) {
14216 if (PyErr_ExceptionMatches(PyExc_TypeError))
14217 goto wrongtype;
14218 return -1;
14219 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014220 }
14221 assert(PyLong_Check(iobj));
14222 }
14223 else {
14224 iobj = v;
14225 Py_INCREF(iobj);
14226 }
14227
14228 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014229 && arg->width == -1 && arg->prec == -1
14230 && !(arg->flags & (F_SIGN | F_BLANK))
14231 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014232 {
14233 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014234 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014235 int base;
14236
Victor Stinnera47082312012-10-04 02:19:54 +020014237 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014238 {
14239 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014240 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014241 case 'd':
14242 case 'i':
14243 case 'u':
14244 base = 10;
14245 break;
14246 case 'o':
14247 base = 8;
14248 break;
14249 case 'x':
14250 case 'X':
14251 base = 16;
14252 break;
14253 }
14254
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014255 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14256 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014257 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014258 }
14259 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014260 return 1;
14261 }
14262
Ethan Furmanb95b5612015-01-23 20:05:18 -080014263 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014264 Py_DECREF(iobj);
14265 if (res == NULL)
14266 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014267 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014268 return 0;
14269
14270wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014271 switch(type)
14272 {
14273 case 'o':
14274 case 'x':
14275 case 'X':
14276 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014277 "%%%c format: an integer is required, "
14278 "not %.200s",
14279 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014280 break;
14281 default:
14282 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014283 "%%%c format: a number is required, "
14284 "not %.200s",
14285 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014286 break;
14287 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014288 return -1;
14289}
14290
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014291static Py_UCS4
14292formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014293{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014294 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014295 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014296 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014297 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014298 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014299 goto onError;
14300 }
14301 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014302 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014303 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014304 /* make sure number is a type of integer */
14305 if (!PyLong_Check(v)) {
14306 iobj = PyNumber_Index(v);
14307 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014308 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014309 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014310 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014311 Py_DECREF(iobj);
14312 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014313 else {
14314 x = PyLong_AsLong(v);
14315 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014316 if (x == -1 && PyErr_Occurred())
14317 goto onError;
14318
Victor Stinner8faf8212011-12-08 22:14:11 +010014319 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014320 PyErr_SetString(PyExc_OverflowError,
14321 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014322 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014323 }
14324
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014325 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014326 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014327
Benjamin Peterson29060642009-01-31 22:14:21 +000014328 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014329 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014330 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014331 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014332}
14333
Victor Stinnera47082312012-10-04 02:19:54 +020014334/* Parse options of an argument: flags, width, precision.
14335 Handle also "%(name)" syntax.
14336
14337 Return 0 if the argument has been formatted into arg->str.
14338 Return 1 if the argument has been written into ctx->writer,
14339 Raise an exception and return -1 on error. */
14340static int
14341unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14342 struct unicode_format_arg_t *arg)
14343{
14344#define FORMAT_READ(ctx) \
14345 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14346
14347 PyObject *v;
14348
Victor Stinnera47082312012-10-04 02:19:54 +020014349 if (arg->ch == '(') {
14350 /* Get argument value from a dictionary. Example: "%(name)s". */
14351 Py_ssize_t keystart;
14352 Py_ssize_t keylen;
14353 PyObject *key;
14354 int pcount = 1;
14355
14356 if (ctx->dict == NULL) {
14357 PyErr_SetString(PyExc_TypeError,
14358 "format requires a mapping");
14359 return -1;
14360 }
14361 ++ctx->fmtpos;
14362 --ctx->fmtcnt;
14363 keystart = ctx->fmtpos;
14364 /* Skip over balanced parentheses */
14365 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14366 arg->ch = FORMAT_READ(ctx);
14367 if (arg->ch == ')')
14368 --pcount;
14369 else if (arg->ch == '(')
14370 ++pcount;
14371 ctx->fmtpos++;
14372 }
14373 keylen = ctx->fmtpos - keystart - 1;
14374 if (ctx->fmtcnt < 0 || pcount > 0) {
14375 PyErr_SetString(PyExc_ValueError,
14376 "incomplete format key");
14377 return -1;
14378 }
14379 key = PyUnicode_Substring(ctx->fmtstr,
14380 keystart, keystart + keylen);
14381 if (key == NULL)
14382 return -1;
14383 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014384 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014385 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014386 }
14387 ctx->args = PyObject_GetItem(ctx->dict, key);
14388 Py_DECREF(key);
14389 if (ctx->args == NULL)
14390 return -1;
14391 ctx->args_owned = 1;
14392 ctx->arglen = -1;
14393 ctx->argidx = -2;
14394 }
14395
14396 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014397 while (--ctx->fmtcnt >= 0) {
14398 arg->ch = FORMAT_READ(ctx);
14399 ctx->fmtpos++;
14400 switch (arg->ch) {
14401 case '-': arg->flags |= F_LJUST; continue;
14402 case '+': arg->flags |= F_SIGN; continue;
14403 case ' ': arg->flags |= F_BLANK; continue;
14404 case '#': arg->flags |= F_ALT; continue;
14405 case '0': arg->flags |= F_ZERO; continue;
14406 }
14407 break;
14408 }
14409
14410 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014411 if (arg->ch == '*') {
14412 v = unicode_format_getnextarg(ctx);
14413 if (v == NULL)
14414 return -1;
14415 if (!PyLong_Check(v)) {
14416 PyErr_SetString(PyExc_TypeError,
14417 "* wants int");
14418 return -1;
14419 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014420 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014421 if (arg->width == -1 && PyErr_Occurred())
14422 return -1;
14423 if (arg->width < 0) {
14424 arg->flags |= F_LJUST;
14425 arg->width = -arg->width;
14426 }
14427 if (--ctx->fmtcnt >= 0) {
14428 arg->ch = FORMAT_READ(ctx);
14429 ctx->fmtpos++;
14430 }
14431 }
14432 else if (arg->ch >= '0' && arg->ch <= '9') {
14433 arg->width = arg->ch - '0';
14434 while (--ctx->fmtcnt >= 0) {
14435 arg->ch = FORMAT_READ(ctx);
14436 ctx->fmtpos++;
14437 if (arg->ch < '0' || arg->ch > '9')
14438 break;
14439 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14440 mixing signed and unsigned comparison. Since arg->ch is between
14441 '0' and '9', casting to int is safe. */
14442 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14443 PyErr_SetString(PyExc_ValueError,
14444 "width too big");
14445 return -1;
14446 }
14447 arg->width = arg->width*10 + (arg->ch - '0');
14448 }
14449 }
14450
14451 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014452 if (arg->ch == '.') {
14453 arg->prec = 0;
14454 if (--ctx->fmtcnt >= 0) {
14455 arg->ch = FORMAT_READ(ctx);
14456 ctx->fmtpos++;
14457 }
14458 if (arg->ch == '*') {
14459 v = unicode_format_getnextarg(ctx);
14460 if (v == NULL)
14461 return -1;
14462 if (!PyLong_Check(v)) {
14463 PyErr_SetString(PyExc_TypeError,
14464 "* wants int");
14465 return -1;
14466 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014467 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014468 if (arg->prec == -1 && PyErr_Occurred())
14469 return -1;
14470 if (arg->prec < 0)
14471 arg->prec = 0;
14472 if (--ctx->fmtcnt >= 0) {
14473 arg->ch = FORMAT_READ(ctx);
14474 ctx->fmtpos++;
14475 }
14476 }
14477 else if (arg->ch >= '0' && arg->ch <= '9') {
14478 arg->prec = arg->ch - '0';
14479 while (--ctx->fmtcnt >= 0) {
14480 arg->ch = FORMAT_READ(ctx);
14481 ctx->fmtpos++;
14482 if (arg->ch < '0' || arg->ch > '9')
14483 break;
14484 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14485 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014486 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014487 return -1;
14488 }
14489 arg->prec = arg->prec*10 + (arg->ch - '0');
14490 }
14491 }
14492 }
14493
14494 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14495 if (ctx->fmtcnt >= 0) {
14496 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14497 if (--ctx->fmtcnt >= 0) {
14498 arg->ch = FORMAT_READ(ctx);
14499 ctx->fmtpos++;
14500 }
14501 }
14502 }
14503 if (ctx->fmtcnt < 0) {
14504 PyErr_SetString(PyExc_ValueError,
14505 "incomplete format");
14506 return -1;
14507 }
14508 return 0;
14509
14510#undef FORMAT_READ
14511}
14512
14513/* Format one argument. Supported conversion specifiers:
14514
14515 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014516 - "i", "d", "u": int or float
14517 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014518 - "e", "E", "f", "F", "g", "G": float
14519 - "c": int or str (1 character)
14520
Victor Stinner8dbd4212012-12-04 09:30:24 +010014521 When possible, the output is written directly into the Unicode writer
14522 (ctx->writer). A string is created when padding is required.
14523
Victor Stinnera47082312012-10-04 02:19:54 +020014524 Return 0 if the argument has been formatted into *p_str,
14525 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014526 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014527static int
14528unicode_format_arg_format(struct unicode_formatter_t *ctx,
14529 struct unicode_format_arg_t *arg,
14530 PyObject **p_str)
14531{
14532 PyObject *v;
14533 _PyUnicodeWriter *writer = &ctx->writer;
14534
14535 if (ctx->fmtcnt == 0)
14536 ctx->writer.overallocate = 0;
14537
Victor Stinnera47082312012-10-04 02:19:54 +020014538 v = unicode_format_getnextarg(ctx);
14539 if (v == NULL)
14540 return -1;
14541
Victor Stinnera47082312012-10-04 02:19:54 +020014542
14543 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014544 case 's':
14545 case 'r':
14546 case 'a':
14547 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14548 /* Fast path */
14549 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14550 return -1;
14551 return 1;
14552 }
14553
14554 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14555 *p_str = v;
14556 Py_INCREF(*p_str);
14557 }
14558 else {
14559 if (arg->ch == 's')
14560 *p_str = PyObject_Str(v);
14561 else if (arg->ch == 'r')
14562 *p_str = PyObject_Repr(v);
14563 else
14564 *p_str = PyObject_ASCII(v);
14565 }
14566 break;
14567
14568 case 'i':
14569 case 'd':
14570 case 'u':
14571 case 'o':
14572 case 'x':
14573 case 'X':
14574 {
14575 int ret = mainformatlong(v, arg, p_str, writer);
14576 if (ret != 0)
14577 return ret;
14578 arg->sign = 1;
14579 break;
14580 }
14581
14582 case 'e':
14583 case 'E':
14584 case 'f':
14585 case 'F':
14586 case 'g':
14587 case 'G':
14588 if (arg->width == -1 && arg->prec == -1
14589 && !(arg->flags & (F_SIGN | F_BLANK)))
14590 {
14591 /* Fast path */
14592 if (formatfloat(v, arg, NULL, writer) == -1)
14593 return -1;
14594 return 1;
14595 }
14596
14597 arg->sign = 1;
14598 if (formatfloat(v, arg, p_str, NULL) == -1)
14599 return -1;
14600 break;
14601
14602 case 'c':
14603 {
14604 Py_UCS4 ch = formatchar(v);
14605 if (ch == (Py_UCS4) -1)
14606 return -1;
14607 if (arg->width == -1 && arg->prec == -1) {
14608 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014609 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014610 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014611 return 1;
14612 }
14613 *p_str = PyUnicode_FromOrdinal(ch);
14614 break;
14615 }
14616
14617 default:
14618 PyErr_Format(PyExc_ValueError,
14619 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014620 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014621 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14622 (int)arg->ch,
14623 ctx->fmtpos - 1);
14624 return -1;
14625 }
14626 if (*p_str == NULL)
14627 return -1;
14628 assert (PyUnicode_Check(*p_str));
14629 return 0;
14630}
14631
14632static int
14633unicode_format_arg_output(struct unicode_formatter_t *ctx,
14634 struct unicode_format_arg_t *arg,
14635 PyObject *str)
14636{
14637 Py_ssize_t len;
14638 enum PyUnicode_Kind kind;
14639 void *pbuf;
14640 Py_ssize_t pindex;
14641 Py_UCS4 signchar;
14642 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014643 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014644 Py_ssize_t sublen;
14645 _PyUnicodeWriter *writer = &ctx->writer;
14646 Py_UCS4 fill;
14647
14648 fill = ' ';
14649 if (arg->sign && arg->flags & F_ZERO)
14650 fill = '0';
14651
14652 if (PyUnicode_READY(str) == -1)
14653 return -1;
14654
14655 len = PyUnicode_GET_LENGTH(str);
14656 if ((arg->width == -1 || arg->width <= len)
14657 && (arg->prec == -1 || arg->prec >= len)
14658 && !(arg->flags & (F_SIGN | F_BLANK)))
14659 {
14660 /* Fast path */
14661 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14662 return -1;
14663 return 0;
14664 }
14665
14666 /* Truncate the string for "s", "r" and "a" formats
14667 if the precision is set */
14668 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14669 if (arg->prec >= 0 && len > arg->prec)
14670 len = arg->prec;
14671 }
14672
14673 /* Adjust sign and width */
14674 kind = PyUnicode_KIND(str);
14675 pbuf = PyUnicode_DATA(str);
14676 pindex = 0;
14677 signchar = '\0';
14678 if (arg->sign) {
14679 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14680 if (ch == '-' || ch == '+') {
14681 signchar = ch;
14682 len--;
14683 pindex++;
14684 }
14685 else if (arg->flags & F_SIGN)
14686 signchar = '+';
14687 else if (arg->flags & F_BLANK)
14688 signchar = ' ';
14689 else
14690 arg->sign = 0;
14691 }
14692 if (arg->width < len)
14693 arg->width = len;
14694
14695 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014696 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014697 if (!(arg->flags & F_LJUST)) {
14698 if (arg->sign) {
14699 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014700 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014701 }
14702 else {
14703 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014704 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014705 }
14706 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014707 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14708 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014709 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014710 }
14711
Victor Stinnera47082312012-10-04 02:19:54 +020014712 buflen = arg->width;
14713 if (arg->sign && len == arg->width)
14714 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014715 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014716 return -1;
14717
14718 /* Write the sign if needed */
14719 if (arg->sign) {
14720 if (fill != ' ') {
14721 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14722 writer->pos += 1;
14723 }
14724 if (arg->width > len)
14725 arg->width--;
14726 }
14727
14728 /* Write the numeric prefix for "x", "X" and "o" formats
14729 if the alternate form is used.
14730 For example, write "0x" for the "%#x" format. */
14731 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14732 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14733 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14734 if (fill != ' ') {
14735 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14736 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14737 writer->pos += 2;
14738 pindex += 2;
14739 }
14740 arg->width -= 2;
14741 if (arg->width < 0)
14742 arg->width = 0;
14743 len -= 2;
14744 }
14745
14746 /* Pad left with the fill character if needed */
14747 if (arg->width > len && !(arg->flags & F_LJUST)) {
14748 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014749 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014750 writer->pos += sublen;
14751 arg->width = len;
14752 }
14753
14754 /* If padding with spaces: write sign if needed and/or numeric prefix if
14755 the alternate form is used */
14756 if (fill == ' ') {
14757 if (arg->sign) {
14758 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14759 writer->pos += 1;
14760 }
14761 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14762 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14763 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14764 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14765 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14766 writer->pos += 2;
14767 pindex += 2;
14768 }
14769 }
14770
14771 /* Write characters */
14772 if (len) {
14773 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14774 str, pindex, len);
14775 writer->pos += len;
14776 }
14777
14778 /* Pad right with the fill character if needed */
14779 if (arg->width > len) {
14780 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014781 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014782 writer->pos += sublen;
14783 }
14784 return 0;
14785}
14786
14787/* Helper of PyUnicode_Format(): format one arg.
14788 Return 0 on success, raise an exception and return -1 on error. */
14789static int
14790unicode_format_arg(struct unicode_formatter_t *ctx)
14791{
14792 struct unicode_format_arg_t arg;
14793 PyObject *str;
14794 int ret;
14795
Victor Stinner8dbd4212012-12-04 09:30:24 +010014796 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014797 if (arg.ch == '%') {
14798 ctx->fmtpos++;
14799 ctx->fmtcnt--;
14800 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14801 return -1;
14802 return 0;
14803 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014804 arg.flags = 0;
14805 arg.width = -1;
14806 arg.prec = -1;
14807 arg.sign = 0;
14808 str = NULL;
14809
Victor Stinnera47082312012-10-04 02:19:54 +020014810 ret = unicode_format_arg_parse(ctx, &arg);
14811 if (ret == -1)
14812 return -1;
14813
14814 ret = unicode_format_arg_format(ctx, &arg, &str);
14815 if (ret == -1)
14816 return -1;
14817
14818 if (ret != 1) {
14819 ret = unicode_format_arg_output(ctx, &arg, str);
14820 Py_DECREF(str);
14821 if (ret == -1)
14822 return -1;
14823 }
14824
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014825 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014826 PyErr_SetString(PyExc_TypeError,
14827 "not all arguments converted during string formatting");
14828 return -1;
14829 }
14830 return 0;
14831}
14832
Alexander Belopolsky40018472011-02-26 01:02:56 +000014833PyObject *
14834PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014835{
Victor Stinnera47082312012-10-04 02:19:54 +020014836 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014837
Guido van Rossumd57fd912000-03-10 22:53:23 +000014838 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014839 PyErr_BadInternalCall();
14840 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014841 }
Victor Stinnera47082312012-10-04 02:19:54 +020014842
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014843 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014844 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014845
14846 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014847 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14848 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14849 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14850 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014851
Victor Stinner8f674cc2013-04-17 23:02:17 +020014852 _PyUnicodeWriter_Init(&ctx.writer);
14853 ctx.writer.min_length = ctx.fmtcnt + 100;
14854 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014855
Guido van Rossumd57fd912000-03-10 22:53:23 +000014856 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014857 ctx.arglen = PyTuple_Size(args);
14858 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014859 }
14860 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014861 ctx.arglen = -1;
14862 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014863 }
Victor Stinnera47082312012-10-04 02:19:54 +020014864 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014865 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014866 ctx.dict = args;
14867 else
14868 ctx.dict = NULL;
14869 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014870
Victor Stinnera47082312012-10-04 02:19:54 +020014871 while (--ctx.fmtcnt >= 0) {
14872 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014873 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014874
14875 nonfmtpos = ctx.fmtpos++;
14876 while (ctx.fmtcnt >= 0 &&
14877 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14878 ctx.fmtpos++;
14879 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014880 }
Victor Stinnera47082312012-10-04 02:19:54 +020014881 if (ctx.fmtcnt < 0) {
14882 ctx.fmtpos--;
14883 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014884 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014885
Victor Stinnercfc4c132013-04-03 01:48:39 +020014886 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14887 nonfmtpos, ctx.fmtpos) < 0)
14888 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014889 }
14890 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014891 ctx.fmtpos++;
14892 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014893 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014894 }
14895 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014896
Victor Stinnera47082312012-10-04 02:19:54 +020014897 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014898 PyErr_SetString(PyExc_TypeError,
14899 "not all arguments converted during string formatting");
14900 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014901 }
14902
Victor Stinnera47082312012-10-04 02:19:54 +020014903 if (ctx.args_owned) {
14904 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014905 }
Victor Stinnera47082312012-10-04 02:19:54 +020014906 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014907
Benjamin Peterson29060642009-01-31 22:14:21 +000014908 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014909 _PyUnicodeWriter_Dealloc(&ctx.writer);
14910 if (ctx.args_owned) {
14911 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014912 }
14913 return NULL;
14914}
14915
Jeremy Hylton938ace62002-07-17 16:30:39 +000014916static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014917unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14918
Tim Peters6d6c1a32001-08-02 04:15:00 +000014919static PyObject *
14920unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14921{
Benjamin Peterson29060642009-01-31 22:14:21 +000014922 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014923 static char *kwlist[] = {"object", "encoding", "errors", 0};
14924 char *encoding = NULL;
14925 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014926
Benjamin Peterson14339b62009-01-31 16:36:08 +000014927 if (type != &PyUnicode_Type)
14928 return unicode_subtype_new(type, args, kwds);
14929 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014930 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014931 return NULL;
14932 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014933 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014934 if (encoding == NULL && errors == NULL)
14935 return PyObject_Str(x);
14936 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014937 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014938}
14939
Guido van Rossume023fe02001-08-30 03:12:59 +000014940static PyObject *
14941unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14942{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014943 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014944 Py_ssize_t length, char_size;
14945 int share_wstr, share_utf8;
14946 unsigned int kind;
14947 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014948
Benjamin Peterson14339b62009-01-31 16:36:08 +000014949 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014950
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014951 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014952 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014953 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014954 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014955 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014956 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014957 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014958 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014959
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014960 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014961 if (self == NULL) {
14962 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014963 return NULL;
14964 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014965 kind = PyUnicode_KIND(unicode);
14966 length = PyUnicode_GET_LENGTH(unicode);
14967
14968 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014969#ifdef Py_DEBUG
14970 _PyUnicode_HASH(self) = -1;
14971#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014972 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014973#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014974 _PyUnicode_STATE(self).interned = 0;
14975 _PyUnicode_STATE(self).kind = kind;
14976 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014977 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014978 _PyUnicode_STATE(self).ready = 1;
14979 _PyUnicode_WSTR(self) = NULL;
14980 _PyUnicode_UTF8_LENGTH(self) = 0;
14981 _PyUnicode_UTF8(self) = NULL;
14982 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014983 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014984
14985 share_utf8 = 0;
14986 share_wstr = 0;
14987 if (kind == PyUnicode_1BYTE_KIND) {
14988 char_size = 1;
14989 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14990 share_utf8 = 1;
14991 }
14992 else if (kind == PyUnicode_2BYTE_KIND) {
14993 char_size = 2;
14994 if (sizeof(wchar_t) == 2)
14995 share_wstr = 1;
14996 }
14997 else {
14998 assert(kind == PyUnicode_4BYTE_KIND);
14999 char_size = 4;
15000 if (sizeof(wchar_t) == 4)
15001 share_wstr = 1;
15002 }
15003
15004 /* Ensure we won't overflow the length. */
15005 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15006 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015007 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015008 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015009 data = PyObject_MALLOC((length + 1) * char_size);
15010 if (data == NULL) {
15011 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015012 goto onError;
15013 }
15014
Victor Stinnerc3c74152011-10-02 20:39:55 +020015015 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015016 if (share_utf8) {
15017 _PyUnicode_UTF8_LENGTH(self) = length;
15018 _PyUnicode_UTF8(self) = data;
15019 }
15020 if (share_wstr) {
15021 _PyUnicode_WSTR_LENGTH(self) = length;
15022 _PyUnicode_WSTR(self) = (wchar_t *)data;
15023 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015024
Christian Heimesf051e432016-09-13 20:22:02 +020015025 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015026 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015027 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015028#ifdef Py_DEBUG
15029 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15030#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015031 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015032 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015033
15034onError:
15035 Py_DECREF(unicode);
15036 Py_DECREF(self);
15037 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015038}
15039
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015040PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015041"str(object='') -> str\n\
15042str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015043\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015044Create a new string object from the given object. If encoding or\n\
15045errors is specified, then the object must expose a data buffer\n\
15046that will be decoded using the given encoding and error handler.\n\
15047Otherwise, returns the result of object.__str__() (if defined)\n\
15048or repr(object).\n\
15049encoding defaults to sys.getdefaultencoding().\n\
15050errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015051
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015052static PyObject *unicode_iter(PyObject *seq);
15053
Guido van Rossumd57fd912000-03-10 22:53:23 +000015054PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015055 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015056 "str", /* tp_name */
15057 sizeof(PyUnicodeObject), /* tp_basicsize */
15058 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015059 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015060 (destructor)unicode_dealloc, /* tp_dealloc */
15061 0, /* tp_print */
15062 0, /* tp_getattr */
15063 0, /* tp_setattr */
15064 0, /* tp_reserved */
15065 unicode_repr, /* tp_repr */
15066 &unicode_as_number, /* tp_as_number */
15067 &unicode_as_sequence, /* tp_as_sequence */
15068 &unicode_as_mapping, /* tp_as_mapping */
15069 (hashfunc) unicode_hash, /* tp_hash*/
15070 0, /* tp_call*/
15071 (reprfunc) unicode_str, /* tp_str */
15072 PyObject_GenericGetAttr, /* tp_getattro */
15073 0, /* tp_setattro */
15074 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015075 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015076 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15077 unicode_doc, /* tp_doc */
15078 0, /* tp_traverse */
15079 0, /* tp_clear */
15080 PyUnicode_RichCompare, /* tp_richcompare */
15081 0, /* tp_weaklistoffset */
15082 unicode_iter, /* tp_iter */
15083 0, /* tp_iternext */
15084 unicode_methods, /* tp_methods */
15085 0, /* tp_members */
15086 0, /* tp_getset */
15087 &PyBaseObject_Type, /* tp_base */
15088 0, /* tp_dict */
15089 0, /* tp_descr_get */
15090 0, /* tp_descr_set */
15091 0, /* tp_dictoffset */
15092 0, /* tp_init */
15093 0, /* tp_alloc */
15094 unicode_new, /* tp_new */
15095 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015096};
15097
15098/* Initialize the Unicode implementation */
15099
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015100_PyInitError
15101_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015102{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015103 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015104 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015105 0x000A, /* LINE FEED */
15106 0x000D, /* CARRIAGE RETURN */
15107 0x001C, /* FILE SEPARATOR */
15108 0x001D, /* GROUP SEPARATOR */
15109 0x001E, /* RECORD SEPARATOR */
15110 0x0085, /* NEXT LINE */
15111 0x2028, /* LINE SEPARATOR */
15112 0x2029, /* PARAGRAPH SEPARATOR */
15113 };
15114
Fred Drakee4315f52000-05-09 19:53:39 +000015115 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015116 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015117 if (!unicode_empty) {
15118 return _Py_INIT_ERR("Can't create empty string");
15119 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015120 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015121
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015122 if (PyType_Ready(&PyUnicode_Type) < 0) {
15123 return _Py_INIT_ERR("Can't initialize unicode type");
15124 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015125
15126 /* initialize the linebreak bloom filter */
15127 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015128 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015129 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015130
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015131 if (PyType_Ready(&EncodingMapType) < 0) {
15132 return _Py_INIT_ERR("Can't initialize encoding map type");
15133 }
15134 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15135 return _Py_INIT_ERR("Can't initialize field name iterator type");
15136 }
15137 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15138 return _Py_INIT_ERR("Can't initialize formatter iter type");
15139 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015140 return _Py_INIT_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015141}
15142
15143/* Finalize the Unicode implementation */
15144
/* This implementation does no work and unconditionally returns 0.
   _PyUnicode_Fini() still calls it. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15150
Guido van Rossumd57fd912000-03-10 22:53:23 +000015151void
Thomas Wouters78890102000-07-22 19:25:51 +000015152_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015153{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015154 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015155
Serhiy Storchaka05997252013-01-26 12:14:02 +020015156 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015157
Serhiy Storchaka05997252013-01-26 12:14:02 +020015158 for (i = 0; i < 256; i++)
15159 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015160 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015161 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015162}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015163
Walter Dörwald16807132007-05-25 13:52:07 +000015164void
15165PyUnicode_InternInPlace(PyObject **p)
15166{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015167 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015168 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015169#ifdef Py_DEBUG
15170 assert(s != NULL);
15171 assert(_PyUnicode_CHECK(s));
15172#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015173 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015174 return;
15175#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015176 /* If it's a subclass, we don't really know what putting
15177 it in the interned dict might do. */
15178 if (!PyUnicode_CheckExact(s))
15179 return;
15180 if (PyUnicode_CHECK_INTERNED(s))
15181 return;
15182 if (interned == NULL) {
15183 interned = PyDict_New();
15184 if (interned == NULL) {
15185 PyErr_Clear(); /* Don't leave an exception */
15186 return;
15187 }
15188 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015189 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015190 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015191 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015192 if (t == NULL) {
15193 PyErr_Clear();
15194 return;
15195 }
15196 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015197 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015198 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015199 return;
15200 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015201 /* The two references in interned are not counted by refcnt.
15202 The deallocator will take care of this */
15203 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015204 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015205}
15206
15207void
15208PyUnicode_InternImmortal(PyObject **p)
15209{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015210 PyUnicode_InternInPlace(p);
15211 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015212 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015213 Py_INCREF(*p);
15214 }
Walter Dörwald16807132007-05-25 13:52:07 +000015215}
15216
15217PyObject *
15218PyUnicode_InternFromString(const char *cp)
15219{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015220 PyObject *s = PyUnicode_FromString(cp);
15221 if (s == NULL)
15222 return NULL;
15223 PyUnicode_InternInPlace(&s);
15224 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015225}
15226
Alexander Belopolsky40018472011-02-26 01:02:56 +000015227void
15228_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015229{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015230 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015231 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015232 Py_ssize_t i, n;
15233 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015234
Benjamin Peterson14339b62009-01-31 16:36:08 +000015235 if (interned == NULL || !PyDict_Check(interned))
15236 return;
15237 keys = PyDict_Keys(interned);
15238 if (keys == NULL || !PyList_Check(keys)) {
15239 PyErr_Clear();
15240 return;
15241 }
Walter Dörwald16807132007-05-25 13:52:07 +000015242
Benjamin Peterson14339b62009-01-31 16:36:08 +000015243 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15244 detector, interned unicode strings are not forcibly deallocated;
15245 rather, we give them their stolen references back, and then clear
15246 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015247
Benjamin Peterson14339b62009-01-31 16:36:08 +000015248 n = PyList_GET_SIZE(keys);
15249 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015250 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015251 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015252 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015253 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015254 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015255 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015256 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015257 case SSTATE_NOT_INTERNED:
15258 /* XXX Shouldn't happen */
15259 break;
15260 case SSTATE_INTERNED_IMMORTAL:
15261 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015262 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015263 break;
15264 case SSTATE_INTERNED_MORTAL:
15265 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015266 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015267 break;
15268 default:
15269 Py_FatalError("Inconsistent interned string state.");
15270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015271 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015272 }
15273 fprintf(stderr, "total size of all interned strings: "
15274 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15275 "mortal/immortal\n", mortal_size, immortal_size);
15276 Py_DECREF(keys);
15277 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015278 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015279}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015280
15281
15282/********************* Unicode Iterator **************************/
15283
15284typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015285 PyObject_HEAD
15286 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015287 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015288} unicodeiterobject;
15289
15290static void
15291unicodeiter_dealloc(unicodeiterobject *it)
15292{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015293 _PyObject_GC_UNTRACK(it);
15294 Py_XDECREF(it->it_seq);
15295 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015296}
15297
15298static int
15299unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15300{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015301 Py_VISIT(it->it_seq);
15302 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015303}
15304
15305static PyObject *
15306unicodeiter_next(unicodeiterobject *it)
15307{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015308 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015309
Benjamin Peterson14339b62009-01-31 16:36:08 +000015310 assert(it != NULL);
15311 seq = it->it_seq;
15312 if (seq == NULL)
15313 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015314 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015316 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15317 int kind = PyUnicode_KIND(seq);
15318 void *data = PyUnicode_DATA(seq);
15319 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15320 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015321 if (item != NULL)
15322 ++it->it_index;
15323 return item;
15324 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015325
Benjamin Peterson14339b62009-01-31 16:36:08 +000015326 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015327 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015328 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015329}
15330
15331static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015332unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015333{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015334 Py_ssize_t len = 0;
15335 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015336 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015337 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015338}
15339
15340PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15341
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015342static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015343unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015344{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015345 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015346 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015347 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015348 it->it_seq, it->it_index);
15349 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015350 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015351 if (u == NULL)
15352 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015353 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015354 }
15355}
15356
15357PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15358
15359static PyObject *
15360unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15361{
15362 Py_ssize_t index = PyLong_AsSsize_t(state);
15363 if (index == -1 && PyErr_Occurred())
15364 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015365 if (it->it_seq != NULL) {
15366 if (index < 0)
15367 index = 0;
15368 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15369 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15370 it->it_index = index;
15371 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015372 Py_RETURN_NONE;
15373}
15374
15375PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15376
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015377static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015378 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015379 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015380 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15381 reduce_doc},
15382 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15383 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015384 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015385};
15386
15387PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015388 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15389 "str_iterator", /* tp_name */
15390 sizeof(unicodeiterobject), /* tp_basicsize */
15391 0, /* tp_itemsize */
15392 /* methods */
15393 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15394 0, /* tp_print */
15395 0, /* tp_getattr */
15396 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015397 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015398 0, /* tp_repr */
15399 0, /* tp_as_number */
15400 0, /* tp_as_sequence */
15401 0, /* tp_as_mapping */
15402 0, /* tp_hash */
15403 0, /* tp_call */
15404 0, /* tp_str */
15405 PyObject_GenericGetAttr, /* tp_getattro */
15406 0, /* tp_setattro */
15407 0, /* tp_as_buffer */
15408 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15409 0, /* tp_doc */
15410 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15411 0, /* tp_clear */
15412 0, /* tp_richcompare */
15413 0, /* tp_weaklistoffset */
15414 PyObject_SelfIter, /* tp_iter */
15415 (iternextfunc)unicodeiter_next, /* tp_iternext */
15416 unicodeiter_methods, /* tp_methods */
15417 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015418};
15419
15420static PyObject *
15421unicode_iter(PyObject *seq)
15422{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015423 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015424
Benjamin Peterson14339b62009-01-31 16:36:08 +000015425 if (!PyUnicode_Check(seq)) {
15426 PyErr_BadInternalCall();
15427 return NULL;
15428 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015429 if (PyUnicode_READY(seq) == -1)
15430 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015431 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15432 if (it == NULL)
15433 return NULL;
15434 it->it_index = 0;
15435 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015436 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015437 _PyObject_GC_TRACK(it);
15438 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015439}
15440
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015441
15442size_t
15443Py_UNICODE_strlen(const Py_UNICODE *u)
15444{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015445 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015446}
15447
15448Py_UNICODE*
15449Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15450{
15451 Py_UNICODE *u = s1;
15452 while ((*u++ = *s2++));
15453 return s1;
15454}
15455
15456Py_UNICODE*
15457Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15458{
15459 Py_UNICODE *u = s1;
15460 while ((*u++ = *s2++))
15461 if (n-- == 0)
15462 break;
15463 return s1;
15464}
15465
15466Py_UNICODE*
15467Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15468{
15469 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015470 u1 += wcslen(u1);
15471 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015472 return s1;
15473}
15474
15475int
15476Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15477{
15478 while (*s1 && *s2 && *s1 == *s2)
15479 s1++, s2++;
15480 if (*s1 && *s2)
15481 return (*s1 < *s2) ? -1 : +1;
15482 if (*s1)
15483 return 1;
15484 if (*s2)
15485 return -1;
15486 return 0;
15487}
15488
15489int
15490Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15491{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015492 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015493 for (; n != 0; n--) {
15494 u1 = *s1;
15495 u2 = *s2;
15496 if (u1 != u2)
15497 return (u1 < u2) ? -1 : +1;
15498 if (u1 == '\0')
15499 return 0;
15500 s1++;
15501 s2++;
15502 }
15503 return 0;
15504}
15505
15506Py_UNICODE*
15507Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15508{
15509 const Py_UNICODE *p;
15510 for (p = s; *p; p++)
15511 if (*p == c)
15512 return (Py_UNICODE*)p;
15513 return NULL;
15514}
15515
15516Py_UNICODE*
15517Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15518{
15519 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015520 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015521 while (p != s) {
15522 p--;
15523 if (*p == c)
15524 return (Py_UNICODE*)p;
15525 }
15526 return NULL;
15527}
Victor Stinner331ea922010-08-10 16:37:20 +000015528
Victor Stinner71133ff2010-09-01 23:43:53 +000015529Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015530PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015531{
Victor Stinner577db2c2011-10-11 22:12:48 +020015532 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015533 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015535 if (!PyUnicode_Check(unicode)) {
15536 PyErr_BadArgument();
15537 return NULL;
15538 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015539 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015540 if (u == NULL)
15541 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015542 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015543 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015544 PyErr_NoMemory();
15545 return NULL;
15546 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015547 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015548 size *= sizeof(Py_UNICODE);
15549 copy = PyMem_Malloc(size);
15550 if (copy == NULL) {
15551 PyErr_NoMemory();
15552 return NULL;
15553 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015554 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015555 return copy;
15556}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015557
Georg Brandl66c221e2010-10-14 07:04:07 +000015558/* A _string module, to export formatter_parser and formatter_field_name_split
15559 to the string.Formatter class implemented in Python. */
15560
15561static PyMethodDef _string_methods[] = {
15562 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15563 METH_O, PyDoc_STR("split the argument as a field name")},
15564 {"formatter_parser", (PyCFunction) formatter_parser,
15565 METH_O, PyDoc_STR("parse the argument as a format string")},
15566 {NULL, NULL}
15567};
15568
15569static struct PyModuleDef _string_module = {
15570 PyModuleDef_HEAD_INIT,
15571 "_string",
15572 PyDoc_STR("string helper module"),
15573 0,
15574 _string_methods,
15575 NULL,
15576 NULL,
15577 NULL,
15578 NULL
15579};
15580
15581PyMODINIT_FUNC
15582PyInit__string(void)
15583{
15584 return PyModule_Create(&_string_module);
15585}
15586
15587
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015588#ifdef __cplusplus
15589}
15590#endif